aboutsummaryrefslogtreecommitdiffstats
path: root/src/buffer.h
diff options
context:
space:
mode:
authorYuan Fu2025-03-18 17:26:26 -0700
committerYuan Fu2025-05-03 22:14:03 -0700
commit1897da0b599cc3ea1e4aa626e47ac8943a7b6833 (patch)
treefef9034fb01a0883e5874923ad756b554b49cffd /src/buffer.h
parent159e3a981ed5482393182b036e38818d42405c90 (diff)
downloademacs-1897da0b599cc3ea1e4aa626e47ac8943a7b6833.tar.gz
emacs-1897da0b599cc3ea1e4aa626e47ac8943a7b6833.zip
Add line-column tracking for tree-sitter
Add line-column tracking for tree-sitter parsers. Copied from comments in treesit.c: Technically we had to send tree-sitter the line and column position of each edit. But in practice we just send it dummy values, because tree-sitter doesn't use it for parsing and mostly just carries the line and column positions around and returns them when, e.g., reporting node positions[1]. This has been working fine until we encountered grammars that actually utilize the line and column information for parsing (Haskell)[2]. [1] https://github.com/tree-sitter/tree-sitter/issues/445 [2] https://github.com/tree-sitter/tree-sitter/issues/4001 So now we have to keep track of line and column positions and pass valid values to tree-sitter. (It adds quite some complexity, but only linearly; one can ignore all the linecol stuff when trying to understand treesit code and then come back to it later.) Eli convinced me to disable tracking by default, and only enable it for languages that need it. So the buffer starts out not tracking linecol. And when a parser is created, if the language is in treesit-languages-require-line-column-tracking, we enable tracking in the buffer, and enable tracking for the parser. To simplify things, once a buffer starts tracking linecol, it never disables tracking, even if parsers that need tracking are all deleted; and for parsers, tracking is determined at creation time, if it starts out tracking/non-tracking, it stays that way, regardless of later changes to treesit-languages-require-line-column-tracking. To make calculating line/column positions fast, we store linecol caches for begv, point, and zv in the buffer (buf->ts_linecol_xxx); and in the parser object, we store linecol cache for visible beg/end of that parser. In buffer editing functions, we need the linecol for start/old_end/new_end, those can be calculated by scanning newlines (treesit_linecol_of_pos) from the buffer point cache, which should be always near the point. 
And we usually set the calculated linecol of new_end back to the buffer point cache. We also need to calculate linecol for the visible_beg/end for each parser, and linecol for the buffer's begv/zv, these positions are usually far from point, so we have caches for all of them (in either the parser object or the buffer). These positions are far from point, so it's inefficient to scan newlines from point to there to get up-to-date linecol for them; but at the same time, because they're far and outside the changed region, we can calculate their change in line and column number by simply counting how many newlines are added/removed in the changed region (compute_new_linecol_by_change). * doc/lispref/parsing.texi (Using Parser): Mention line-column tracking in manual. * etc/NEWS: Add news. * lisp/treesit.el: (treesit-languages-need-line-column-tracking): New variable. * src/buffer.c: Include treesit.h (for TREESIT_EMPTY_LINECOL). (Fget_buffer_create): (Fmake_indirect_buffer): Initialize new buffer fields. (Fbuffer_swap_text): Add new buffer fields. * src/buffer.h (ts_linecol): New struct. (buffer): New buffer fields. (BUF_TS_LINECOL_BEGV): (BUF_TS_LINECOL_POINT): (BUF_TS_LINECOL_ZV): (SET_BUF_TS_LINECOL_BEGV): (SET_BUF_TS_LINECOL_POINT): (SET_BUF_TS_LINECOL_ZV): New inline functions. * src/casefiddle.c (casify_region): Record linecol info. * src/editfns.c (Fsubst_char_in_region): (Ftranslate_region_internal): (Ftranspose_regions): Record linecol info. * src/insdel.c (insert_1_both): (insert_from_string_1): (insert_from_gap_1): (insert_from_buffer): (replace_range): (del_range_2): Record linecol info. * src/treesit.c (TREESIT_BOB_LINECOL): (TREESIT_EMPTY_LINECOL): (TREESIT_TS_POINT_1_0): New constants. 
(treesit_debug_print_linecol): (treesit_buf_tracks_linecol_p): (restore_restriction_and_selective_display): (treesit_count_lines): (treesit_debug_validate_linecol): (treesit_linecol_of_pos): (treesit_make_ts_point): (Ftreesit_tracking_line_column_p): (Ftreesit_parser_tracking_line_column_p): New functions. (treesit_tree_edit_1): Accept real TSPoint and pass to tree-sitter. (compute_new_linecol_by_change): New function. (treesit_record_change_1): Rename from treesit_record_change, handle linecol if tracking is enabled. (treesit_linecol_maybe): New function. (treesit_record_change): New wrapper around treesit_record_change_1 that handles some boilerplate and sets buffer state. (treesit_sync_visible_region): Handle linecol if tracking is enabled. (make_treesit_parser): Setup parser's linecol cache if tracking is enabled. (Ftreesit_parser_create): Enable tracking if the parser's language requires it. (Ftreesit__linecol_at): (Ftreesit__linecol_cache_set): (Ftreesit__linecol_cache): New functions for debugging and testing. (syms_of_treesit): New variable Vtreesit_languages_require_line_column_tracking. * src/treesit.h (Lisp_TS_Parser): New fields. (TREESIT_BOB_LINECOL): (TREESIT_EMPTY_LINECOL): New constants. * test/src/treesit-tests.el (treesit-linecol-basic): (treesit-linecol-search-back-across-newline): (treesit-linecol-col-same-line): (treesit-linecol-enable-disable): New tests. * src/lisp.h: Declare display_count_lines. * src/xdisp.c (display_count_lines): Remove static keyword.
Diffstat (limited to 'src/buffer.h')
-rw-r--r--src/buffer.h72
1 files changed, 72 insertions, 0 deletions
diff --git a/src/buffer.h b/src/buffer.h
index d19ff22babd..26a334ea810 100644
--- a/src/buffer.h
+++ b/src/buffer.h
@@ -220,6 +220,20 @@ extern ptrdiff_t advance_to_char_boundary (ptrdiff_t byte_pos);
220 220
221/* Define the actual buffer data structures. */ 221/* Define the actual buffer data structures. */
222 222
/* A cached buffer position together with its line and column number.
   The column number is counted in bytes.  Neither the line number nor
   the column number respects narrowing.  */
struct ts_linecol
{
  /* The byte position.  */
  ptrdiff_t bytepos;
  /* The line number of this position.  */
  ptrdiff_t line;
  /* The column number of this position, in bytes and 0-based.
     Basically the byte offset from BOL (or BOB).  */
  ptrdiff_t col;
};
236
223/* This data structure describes the actual text contents of a buffer. 237/* This data structure describes the actual text contents of a buffer.
224 It is shared between indirect buffers and their base buffer. */ 238 It is shared between indirect buffers and their base buffer. */
225 239
@@ -700,6 +714,25 @@ struct buffer
700 /* The interval tree containing this buffer's overlays. */ 714 /* The interval tree containing this buffer's overlays. */
701 struct itree_tree *overlays; 715 struct itree_tree *overlays;
702 716
717 /* Right now only tree-sitter makes use of this, so I don't want
718 non-tree-sitter build to pay for it. If something else can make
719 use of this, we can remove the gate. */
720#ifdef HAVE_TREE_SITTER
721 /* Cache of line and column number of a position. Tree-sitter uses
722 this cache to calculate line and column of the beginning and end of
723 buffer edits. Stores three caches for BEGV, point, ZV,
724 respectively. All three are refreshed in buffer edit functions, so
725 they're always up-to-date (in the sense that the bytepos and
726 line/column number are in sync, not in the sense that the bytepos
727 is at the actual position of point/BEGV/ZV, indeed, most of the
728 time the bytepos is only near the actual position). All caches are
729 initialized to empty, meaning no linecol tracking for this
730 buffer. */
731 struct ts_linecol ts_linecol_begv;
732 struct ts_linecol ts_linecol_point;
733 struct ts_linecol ts_linecol_zv;
734#endif
735
703 /* Changes in the buffer are recorded here for undo, and t means 736 /* Changes in the buffer are recorded here for undo, and t means
704 don't record anything. This information belongs to the base 737 don't record anything. This information belongs to the base
705 buffer of an indirect buffer. But we can't store it in the 738 buffer of an indirect buffer. But we can't store it in the
@@ -1134,6 +1167,45 @@ BUFFER_CHECK_INDIRECTION (struct buffer *b)
1134 } 1167 }
1135} 1168}
1136 1169
1170#ifdef HAVE_TREE_SITTER
1171
1172INLINE struct ts_linecol
1173BUF_TS_LINECOL_BEGV (struct buffer *buf)
1174{
1175 return buf->ts_linecol_begv;
1176}
1177INLINE struct ts_linecol
1178BUF_TS_LINECOL_POINT (struct buffer *buf)
1179{
1180 return buf->ts_linecol_point;
1181}
1182
1183INLINE struct ts_linecol
1184BUF_TS_LINECOL_ZV (struct buffer *buf)
1185{
1186 return buf->ts_linecol_zv;
1187}
1188
1189INLINE void
1190SET_BUF_TS_LINECOL_BEGV (struct buffer *buf, struct ts_linecol linecol)
1191{
1192 buf->ts_linecol_begv = linecol;
1193}
1194
1195INLINE void
1196SET_BUF_TS_LINECOL_POINT (struct buffer *buf, struct ts_linecol linecol)
1197{
1198 buf->ts_linecol_point = linecol;
1199}
1200
1201INLINE void
1202SET_BUF_TS_LINECOL_ZV (struct buffer *buf, struct ts_linecol linecol)
1203{
1204 buf->ts_linecol_zv = linecol;
1205}
1206
1207#endif
1208
1137/* This structure holds the default values of the buffer-local variables 1209/* This structure holds the default values of the buffer-local variables
1138 that have special slots in each buffer. 1210 that have special slots in each buffer.
1139 The default value occupies the same slot in this structure 1211 The default value occupies the same slot in this structure