@@ -70,11 +70,14 @@ impl CommonExtractor {
7070 while let Some ( m) = matches. next ( ) {
7171 for capture in m. captures {
7272 let node = capture. node ;
73- let start = node. start_byte ( ) ;
74- let end = node. end_byte ( ) ;
73+ let start_byte = node. start_byte ( ) ;
74+ let end_byte = node. end_byte ( ) ;
7575
7676 if Self :: is_valid_comment_node ( node, language) {
77- comment_ranges. push ( ( start, end) ) ;
77+ // Convert byte positions to character positions
78+ let start_char = Self :: byte_to_char_position ( source_code, start_byte) ;
79+ let end_char = Self :: byte_to_char_position ( source_code, end_byte) ;
80+ comment_ranges. push ( ( start_char, end_char) ) ;
7881 }
7982 }
8083 }
@@ -83,6 +86,13 @@ impl CommonExtractor {
8386 Ok ( comment_ranges)
8487 }
8588
/// Convert a byte offset into `source_code` to a character (Unicode scalar value) offset.
///
/// Returns the number of characters that end at or before `byte_pos`:
/// - an offset on a character boundary yields the exact character index;
/// - an offset past the end of the string yields the total character count;
/// - an offset that falls inside a multi-byte character is rounded down to the
///   index of that character instead of panicking (slicing `&source_code[..byte_pos]`
///   would panic there).
fn byte_to_char_position(source_code: &str, byte_pos: usize) -> usize {
    source_code
        .char_indices()
        // Keep only characters whose final byte lies at or before `byte_pos`.
        .take_while(|&(start, ch)| start + ch.len_utf8() <= byte_pos)
        .count()
}
95+
8696 fn is_valid_comment_node ( node : Node , language : & str ) -> bool {
8797 let node_kind = node. kind ( ) ;
8898 match language {
@@ -111,22 +121,27 @@ impl CommonExtractor {
111121 file_path : & Path ,
112122 language : & str ,
113123 capture_name : & str ,
114- file_comment_ranges : & [ ( usize , usize ) ] ,
124+ file_comment_ranges : & [ ( usize , usize ) ] , // Already in character positions
115125 ) -> Option < CodeChunk > {
116126 let start_byte = node. start_byte ( ) ;
117127 let end_byte = node. end_byte ( ) ;
118128 let content = & source_code[ start_byte..end_byte] ;
119129
130+ // Convert byte positions to character positions to match file_comment_ranges
131+ let start_char = Self :: byte_to_char_position ( source_code, start_byte) ;
132+ let end_char = Self :: byte_to_char_position ( source_code, end_byte) ;
133+
120134 let start_line = node. start_position ( ) . row + 1 ;
121135 let end_line = node. end_position ( ) . row + 1 ;
122- let original_indentation = node. start_position ( ) . column ;
136+ let original_indentation_bytes = node. start_position ( ) . column ;
123137
124138 // Extract actual indentation characters from source
125- let original_indent_chars = if original_indentation > 0 {
126- Self :: extract_line_indent_chars (
139+ // Note: tree-sitter reports the start column (original_indentation_bytes) in bytes; the indentation text is derived from it so it can later be counted in characters
140+ let original_indent_chars = if original_indentation_bytes > 0 {
141+ Self :: extract_line_indent_chars_corrected (
127142 source_code,
128143 node. start_position ( ) . row ,
129- original_indentation ,
144+ original_indentation_bytes ,
130145 )
131146 } else {
132147 String :: new ( )
@@ -142,22 +157,42 @@ impl CommonExtractor {
142157 . or_else ( || Self :: extract_name ( node, source_code) )
143158 . unwrap_or_else ( || "unknown" . to_string ( ) ) ;
144159
160+ let normalized_content =
161+ Self :: normalize_first_line_indentation ( content, & original_indent_chars) ;
162+
163+ // Simple position calculation:
164+ // code_start_pos = start_char (character offset where the tree-sitter chunk begins)
165+ // chunk_start_pos = original indentation (derived from node.start_position().column)
166+ // comment_start_pos = comment_raw_pos_start - code_start_pos
167+
168+ // Adjust comment ranges to be relative to the normalized content.
169+ // Note:
170+ // - file_comment_ranges are character-based positions for the whole file
171+ // - We first convert them to chunk-relative character positions
172+ // - Then we add the first-line indentation characters we injected at the very
173+ // beginning of the normalized content, so display-time positions match
174+ let indent_offset_chars = original_indent_chars. chars ( ) . count ( ) ;
175+
145176 let chunk_comment_ranges: Vec < ( usize , usize ) > = file_comment_ranges
146177 . iter ( )
147- . filter_map ( |& ( comment_start, comment_end) | {
148- if comment_start >= start_byte && comment_end <= end_byte {
149- Some ( ( comment_start - start_byte, comment_end - start_byte) )
178+ . filter_map ( |& ( comment_raw_pos_start, comment_raw_pos_end) | {
179+ // Check if comment is within this chunk's boundaries
180+ if comment_raw_pos_start >= start_char && comment_raw_pos_end <= end_char {
181+ // Convert to chunk-relative positions
182+ let comment_start_pos = comment_raw_pos_start - start_char;
183+ let comment_end_pos = comment_raw_pos_end - start_char;
184+
185+ // Account for added indentation at the very start of normalized content
186+ let adjusted_start = comment_start_pos + indent_offset_chars;
187+ let adjusted_end = comment_end_pos + indent_offset_chars;
188+
189+ Some ( ( adjusted_start, adjusted_end) )
150190 } else {
151191 None
152192 }
153193 } )
154194 . collect ( ) ;
155195
156- let normalized_content = Self :: normalize_first_line_indentation (
157- content,
158- & original_indent_chars,
159- ) ;
160-
161196 Some ( CodeChunk {
162197 content : normalized_content,
163198 file_path : file_path. to_path_buf ( ) ,
@@ -167,7 +202,8 @@ impl CommonExtractor {
167202 chunk_type,
168203 name,
169204 comment_ranges : chunk_comment_ranges,
170- original_indentation,
205+ // Store indentation as character count to keep extractor outputs character-based
206+ original_indentation : indent_offset_chars,
171207 } )
172208 }
173209
@@ -207,17 +243,14 @@ impl CommonExtractor {
207243 None
208244 }
209245
210- fn normalize_first_line_indentation (
211- content : & str ,
212- original_indent_chars : & str ,
213- ) -> String {
246+ fn normalize_first_line_indentation ( content : & str , original_indent_chars : & str ) -> String {
214247 let lines: Vec < & str > = content. lines ( ) . collect ( ) ;
215248 if lines. is_empty ( ) {
216249 return content. to_string ( ) ;
217250 }
218251
219252 let mut result_lines = Vec :: new ( ) ;
220-
253+
221254 for ( line_idx, line) in lines. iter ( ) . enumerate ( ) {
222255 if line_idx == 0 {
223256 // First line: add original indentation characters from source
@@ -231,15 +264,22 @@ impl CommonExtractor {
231264 result_lines. join ( "\n " )
232265 }
233266
234- fn extract_line_indent_chars (
267+ pub fn extract_line_indent_chars_corrected (
235268 source_code : & str ,
236269 line_row : usize ,
237- indent_length : usize ,
270+ indent_byte_length : usize ,
238271 ) -> String {
239272 let lines: Vec < & str > = source_code. lines ( ) . collect ( ) ;
240273 if line_row < lines. len ( ) {
241274 let line = lines[ line_row] ;
242- line. chars ( ) . take ( indent_length) . collect ( )
275+ // Convert byte position to character position first
276+ if indent_byte_length <= line. len ( ) {
277+ let indent_char_count = line[ ..indent_byte_length] . chars ( ) . count ( ) ;
278+ line. chars ( ) . take ( indent_char_count) . collect ( )
279+ } else {
280+ // If byte length exceeds line length, take all characters
281+ line. to_string ( )
282+ }
243283 } else {
244284 String :: new ( )
245285 }
0 commit comments