@@ -70,11 +70,14 @@ impl CommonExtractor {
7070 while let Some ( m) = matches. next ( ) {
7171 for capture in m. captures {
7272 let node = capture. node ;
73- let start = node. start_byte ( ) ;
74- let end = node. end_byte ( ) ;
73+ let start_byte = node. start_byte ( ) ;
74+ let end_byte = node. end_byte ( ) ;
7575
7676 if Self :: is_valid_comment_node ( node, language) {
77- comment_ranges. push ( ( start, end) ) ;
77+ // Convert byte positions to character positions
78+ let start_char = Self :: byte_to_char_position ( source_code, start_byte) ;
79+ let end_char = Self :: byte_to_char_position ( source_code, end_byte) ;
80+ comment_ranges. push ( ( start_char, end_char) ) ;
7881 }
7982 }
8083 }
@@ -83,6 +86,13 @@ impl CommonExtractor {
8386 Ok ( comment_ranges)
8487 }
8588
/// Convert a byte offset in `source_code` into a character (Unicode scalar value) offset.
///
/// The result is the number of characters that end at or before `byte_pos`:
/// - an offset on a character boundary maps to the index of the character starting there;
/// - an offset inside a multi-byte character maps to the index of that character
///   (rounded down) instead of panicking on an invalid slice boundary;
/// - an offset past the end of the string maps to the total character count.
fn byte_to_char_position(source_code: &str, byte_pos: usize) -> usize {
    source_code
        .char_indices()
        // Count only characters that lie completely before `byte_pos`; no string
        // slicing is involved, so a mid-character offset cannot panic.
        .take_while(|&(start, ch)| start + ch.len_utf8() <= byte_pos)
        .count()
}
95+
8696 fn is_valid_comment_node ( node : Node , language : & str ) -> bool {
8797 let node_kind = node. kind ( ) ;
8898 match language {
@@ -111,22 +121,27 @@ impl CommonExtractor {
111121 file_path : & Path ,
112122 language : & str ,
113123 capture_name : & str ,
114- file_comment_ranges : & [ ( usize , usize ) ] ,
124+ file_comment_ranges : & [ ( usize , usize ) ] , // Already in character positions
115125 ) -> Option < CodeChunk > {
116126 let start_byte = node. start_byte ( ) ;
117127 let end_byte = node. end_byte ( ) ;
118128 let content = & source_code[ start_byte..end_byte] ;
119129
130+ // Convert byte positions to character positions to match file_comment_ranges
131+ let start_char = Self :: byte_to_char_position ( source_code, start_byte) ;
132+ let end_char = Self :: byte_to_char_position ( source_code, end_byte) ;
133+
120134 let start_line = node. start_position ( ) . row + 1 ;
121135 let end_line = node. end_position ( ) . row + 1 ;
122- let original_indentation = node. start_position ( ) . column ;
136+ let original_indentation_bytes = node. start_position ( ) . column ;
123137
124138 // Extract actual indentation characters from source
125- let original_indent_chars = if original_indentation > 0 {
126- Self :: extract_line_indent_chars (
139+ // Note: original_indentation is in byte units from TreeSitter, but we need char units
140+ let original_indent_chars = if original_indentation_bytes > 0 {
141+ Self :: extract_line_indent_chars_corrected (
127142 source_code,
128143 node. start_position ( ) . row ,
129- original_indentation ,
144+ original_indentation_bytes ,
130145 )
131146 } else {
132147 String :: new ( )
@@ -142,24 +157,42 @@ impl CommonExtractor {
142157 . or_else ( || Self :: extract_name ( node, source_code) )
143158 . unwrap_or_else ( || "unknown" . to_string ( ) ) ;
144159
160+ let normalized_content =
161+ Self :: normalize_first_line_indentation ( content, & original_indent_chars) ;
162+
163+ // Simple position calculation:
164+ // code_start_pos = start_char (TreeSitter chunk の行頭)
165+ // chunk_start_pos = original_indentation (node.start_position().column)
166+ // comment_start_pos = comment生pos - code_start_pos
167+
168+ // Adjust comment ranges to be relative to the normalized content.
169+ // Note:
170+ // - file_comment_ranges are character-based positions for the whole file
171+ // - We first convert them to chunk-relative character positions
172+ // - Then we add the first-line indentation characters we injected at the very
173+ // beginning of the normalized content, so display-time positions match
174+ let indent_offset_chars = original_indent_chars. chars ( ) . count ( ) ;
175+
145176 let chunk_comment_ranges: Vec < ( usize , usize ) > = file_comment_ranges
146177 . iter ( )
147- . filter_map ( |& ( comment_start, comment_end) | {
148- if comment_start >= start_byte && comment_end <= end_byte {
149- Some ( ( comment_start - start_byte, comment_end - start_byte) )
178+ . filter_map ( |& ( comment_raw_pos_start, comment_raw_pos_end) | {
179+ // Check if comment is within this chunk's boundaries
180+ if comment_raw_pos_start >= start_char && comment_raw_pos_end <= end_char {
181+ // Convert to chunk-relative positions
182+ let comment_start_pos = comment_raw_pos_start - start_char;
183+ let comment_end_pos = comment_raw_pos_end - start_char;
184+
185+ // Account for added indentation at the very start of normalized content
186+ let adjusted_start = comment_start_pos + indent_offset_chars;
187+ let adjusted_end = comment_end_pos + indent_offset_chars;
188+
189+ Some ( ( adjusted_start, adjusted_end) )
150190 } else {
151191 None
152192 }
153193 } )
154194 . collect ( ) ;
155195
156- let ( normalized_content, normalized_comment_ranges) = Self :: normalize_indentation (
157- content,
158- original_indentation,
159- & original_indent_chars,
160- & chunk_comment_ranges,
161- ) ;
162-
163196 Some ( CodeChunk {
164197 content : normalized_content,
165198 file_path : file_path. to_path_buf ( ) ,
@@ -168,8 +201,9 @@ impl CommonExtractor {
168201 language : language. to_string ( ) ,
169202 chunk_type,
170203 name,
171- comment_ranges : normalized_comment_ranges,
172- original_indentation,
204+ comment_ranges : chunk_comment_ranges,
205+ // Store indentation as character count to keep extractor outputs character-based
206+ original_indentation : indent_offset_chars,
173207 } )
174208 }
175209
@@ -209,111 +243,43 @@ impl CommonExtractor {
209243 None
210244 }
211245
/// Prefix the chunk's first line with the indentation it had in the source file.
///
/// `content` starts at the node's first non-indent byte, so its first line has lost
/// the leading whitespace that the following lines still carry. Prepending
/// `original_indent_chars` restores it.
///
/// Everything after the inserted prefix is kept exactly as it is in `content`,
/// including `\r\n` line endings and any trailing newline. Callers compute comment
/// ranges as chunk-relative character offsets plus the indent length, so dropping or
/// rewriting characters here would shift those ranges.
///
/// Empty `content` is returned unchanged (no indentation is added).
fn normalize_first_line_indentation(content: &str, original_indent_chars: &str) -> String {
    if content.is_empty() {
        return String::new();
    }

    let mut normalized = String::with_capacity(original_indent_chars.len() + content.len());
    normalized.push_str(original_indent_chars);
    normalized.push_str(content);
    normalized
}
307266
308- fn extract_line_indent_chars (
267+ pub fn extract_line_indent_chars_corrected (
309268 source_code : & str ,
310269 line_row : usize ,
311- indent_length : usize ,
270+ indent_byte_length : usize ,
312271 ) -> String {
313272 let lines: Vec < & str > = source_code. lines ( ) . collect ( ) ;
314273 if line_row < lines. len ( ) {
315274 let line = lines[ line_row] ;
316- line. chars ( ) . take ( indent_length) . collect ( )
275+ // Convert byte position to character position first
276+ if indent_byte_length <= line. len ( ) {
277+ let indent_char_count = line[ ..indent_byte_length] . chars ( ) . count ( ) ;
278+ line. chars ( ) . take ( indent_char_count) . collect ( )
279+ } else {
280+ // If byte length exceeds line length, take all characters
281+ line. to_string ( )
282+ }
317283 } else {
318284 String :: new ( )
319285 }
0 commit comments