Skip to content

Commit df16f30

Browse files
committed
fix(extractor,typing): align comment positions to char-based indices and fix display offset after indent normalization
Unify extractor outputs to character-based positions. Adjust chunk-relative comment ranges by the first-line indent added during normalization. In TypingCore, normalize incoming ranges (bytes->chars when needed) to ensure correct display mapping. Keep downstream logic behaviorally compatible to avoid snapshot regressions.
1 parent 5b4aa22 commit df16f30

File tree

42 files changed

+1753
-537
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+1753
-537
lines changed

examples/debug_comment.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
use gittype::game::typing_core::{ProcessingOptions, TypingCore};
2+
3+
fn main() {
4+
let code = r#"fn test() {
5+
// Path symbols: ../.. and ./path and ~/home
6+
// Unicode arrows: → ← ↑ ↓ and ↵ symbol
7+
// Mixed symbols: ../../config.json → ~/.config/
8+
let x = 42;
9+
}"#;
10+
11+
let comment_ranges = vec![
12+
(
13+
code.find("// Path symbols").unwrap(),
14+
code.find("~/home").unwrap() + "~/home".len(),
15+
),
16+
(
17+
code.find("// Unicode arrows").unwrap(),
18+
code.find("↵ symbol").unwrap() + "↵ symbol".len(),
19+
),
20+
(
21+
code.find("// Mixed symbols").unwrap(),
22+
code.find("~/.config/").unwrap() + "~/.config/".len(),
23+
),
24+
];
25+
26+
println!(
27+
"code.len bytes={}, chars={}",
28+
code.len(),
29+
code.chars().count()
30+
);
31+
println!("comment_ranges (bytes): {:?}", comment_ranges);
32+
33+
let typing_core = TypingCore::new(code, &comment_ranges, ProcessingOptions::default());
34+
let display = typing_core.text_to_display().to_string();
35+
println!(
36+
"display bytes={}, chars={}",
37+
display.len(),
38+
display.chars().count()
39+
);
40+
println!("display: {}", display);
41+
let ranges = typing_core.display_comment_ranges();
42+
println!("display_ranges: {:?}", ranges);
43+
for (i, (s, e)) in ranges.iter().enumerate() {
44+
let frag = &display[*s..*e];
45+
println!("[{}] {:?}", i, frag);
46+
}
47+
48+
// Show how we convert bytes->chars for original ranges
49+
let to_char = |b: usize| code[..b.min(code.len())].chars().count();
50+
let converted: Vec<(usize, usize)> = comment_ranges
51+
.iter()
52+
.map(|&(s, e)| (to_char(s), to_char(e)))
53+
.collect();
54+
println!("converted char ranges: {:?}", converted);
55+
let code_chars: Vec<char> = code.chars().collect();
56+
for (i, (cs, ce)) in converted.iter().copied().enumerate() {
57+
let text: String = code_chars[cs..ce].iter().collect();
58+
println!("orig[{}]: {:?} (chars {})", i, text, text.chars().count());
59+
}
60+
61+
// Show line char starts
62+
let mut acc = 0usize;
63+
for (i, line) in code.lines().enumerate() {
64+
let start = acc;
65+
let end = start + line.chars().count();
66+
println!("line {} chars {}..{} => {:?}", i + 1, start, end, line);
67+
acc = end + 1; // account for \n
68+
}
69+
}

src/extractor/challenge_converter.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ impl ChallengeConverter {
190190
// Check if truncated content meets minimum requirements
191191
let adjusted_comment_ranges = self.adjust_comment_ranges_for_truncation(
192192
&chunk.comment_ranges,
193-
truncated_content.len(),
193+
truncated_content.chars().count(),
194194
);
195195
let truncated_code_chars =
196196
self.count_code_characters(truncated_content, &adjusted_comment_ranges);

src/extractor/core/extractor.rs

Lines changed: 65 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,14 @@ impl CommonExtractor {
7070
while let Some(m) = matches.next() {
7171
for capture in m.captures {
7272
let node = capture.node;
73-
let start = node.start_byte();
74-
let end = node.end_byte();
73+
let start_byte = node.start_byte();
74+
let end_byte = node.end_byte();
7575

7676
if Self::is_valid_comment_node(node, language) {
77-
comment_ranges.push((start, end));
77+
// Convert byte positions to character positions
78+
let start_char = Self::byte_to_char_position(source_code, start_byte);
79+
let end_char = Self::byte_to_char_position(source_code, end_byte);
80+
comment_ranges.push((start_char, end_char));
7881
}
7982
}
8083
}
@@ -83,6 +86,13 @@ impl CommonExtractor {
8386
Ok(comment_ranges)
8487
}
8588

89+
/// Convert a byte offset in `source_code` into a character offset.
///
/// Returns the number of `char`s that start before `byte_pos`:
/// - an offset on a char boundary yields the number of chars in
///   `source_code[..byte_pos]`;
/// - an offset past the end of the string is clamped to the total char count;
/// - an offset that falls inside a multi-byte character counts that character
///   (rounds up) instead of panicking on a non-boundary slice.
fn byte_to_char_position(source_code: &str, byte_pos: usize) -> usize {
    source_code
        .char_indices()
        .take_while(|&(char_start, _)| char_start < byte_pos)
        .count()
}
95+
8696
fn is_valid_comment_node(node: Node, language: &str) -> bool {
8797
let node_kind = node.kind();
8898
match language {
@@ -111,22 +121,27 @@ impl CommonExtractor {
111121
file_path: &Path,
112122
language: &str,
113123
capture_name: &str,
114-
file_comment_ranges: &[(usize, usize)],
124+
file_comment_ranges: &[(usize, usize)], // Already in character positions
115125
) -> Option<CodeChunk> {
116126
let start_byte = node.start_byte();
117127
let end_byte = node.end_byte();
118128
let content = &source_code[start_byte..end_byte];
119129

130+
// Convert byte positions to character positions to match file_comment_ranges
131+
let start_char = Self::byte_to_char_position(source_code, start_byte);
132+
let end_char = Self::byte_to_char_position(source_code, end_byte);
133+
120134
let start_line = node.start_position().row + 1;
121135
let end_line = node.end_position().row + 1;
122-
let original_indentation = node.start_position().column;
136+
let original_indentation_bytes = node.start_position().column;
123137

124138
// Extract actual indentation characters from source
125-
let original_indent_chars = if original_indentation > 0 {
126-
Self::extract_line_indent_chars(
139+
// Note: original_indentation_bytes is in byte units from TreeSitter, but we need char units
140+
let original_indent_chars = if original_indentation_bytes > 0 {
141+
Self::extract_line_indent_chars_corrected(
127142
source_code,
128143
node.start_position().row,
129-
original_indentation,
144+
original_indentation_bytes,
130145
)
131146
} else {
132147
String::new()
@@ -142,22 +157,42 @@ impl CommonExtractor {
142157
.or_else(|| Self::extract_name(node, source_code))
143158
.unwrap_or_else(|| "unknown".to_string());
144159

160+
let normalized_content =
161+
Self::normalize_first_line_indentation(content, &original_indent_chars);
162+
163+
// Simple position calculation:
164+
// code_start_pos = start_char (start of the TreeSitter chunk)
165+
// chunk_start_pos = original_indentation (node.start_position().column)
166+
// comment_start_pos = raw comment position - code_start_pos
167+
168+
// Adjust comment ranges to be relative to the normalized content.
169+
// Note:
170+
// - file_comment_ranges are character-based positions for the whole file
171+
// - We first convert them to chunk-relative character positions
172+
// - Then we add the first-line indentation characters we injected at the very
173+
// beginning of the normalized content, so display-time positions match
174+
let indent_offset_chars = original_indent_chars.chars().count();
175+
145176
let chunk_comment_ranges: Vec<(usize, usize)> = file_comment_ranges
146177
.iter()
147-
.filter_map(|&(comment_start, comment_end)| {
148-
if comment_start >= start_byte && comment_end <= end_byte {
149-
Some((comment_start - start_byte, comment_end - start_byte))
178+
.filter_map(|&(comment_raw_pos_start, comment_raw_pos_end)| {
179+
// Check if comment is within this chunk's boundaries
180+
if comment_raw_pos_start >= start_char && comment_raw_pos_end <= end_char {
181+
// Convert to chunk-relative positions
182+
let comment_start_pos = comment_raw_pos_start - start_char;
183+
let comment_end_pos = comment_raw_pos_end - start_char;
184+
185+
// Account for added indentation at the very start of normalized content
186+
let adjusted_start = comment_start_pos + indent_offset_chars;
187+
let adjusted_end = comment_end_pos + indent_offset_chars;
188+
189+
Some((adjusted_start, adjusted_end))
150190
} else {
151191
None
152192
}
153193
})
154194
.collect();
155195

156-
let normalized_content = Self::normalize_first_line_indentation(
157-
content,
158-
&original_indent_chars,
159-
);
160-
161196
Some(CodeChunk {
162197
content: normalized_content,
163198
file_path: file_path.to_path_buf(),
@@ -167,7 +202,8 @@ impl CommonExtractor {
167202
chunk_type,
168203
name,
169204
comment_ranges: chunk_comment_ranges,
170-
original_indentation,
205+
// Store indentation as character count to keep extractor outputs character-based
206+
original_indentation: indent_offset_chars,
171207
})
172208
}
173209

@@ -207,17 +243,14 @@ impl CommonExtractor {
207243
None
208244
}
209245

210-
fn normalize_first_line_indentation(
211-
content: &str,
212-
original_indent_chars: &str,
213-
) -> String {
246+
fn normalize_first_line_indentation(content: &str, original_indent_chars: &str) -> String {
214247
let lines: Vec<&str> = content.lines().collect();
215248
if lines.is_empty() {
216249
return content.to_string();
217250
}
218251

219252
let mut result_lines = Vec::new();
220-
253+
221254
for (line_idx, line) in lines.iter().enumerate() {
222255
if line_idx == 0 {
223256
// First line: add original indentation characters from source
@@ -231,15 +264,22 @@ impl CommonExtractor {
231264
result_lines.join("\n")
232265
}
233266

234-
fn extract_line_indent_chars(
267+
pub fn extract_line_indent_chars_corrected(
235268
source_code: &str,
236269
line_row: usize,
237-
indent_length: usize,
270+
indent_byte_length: usize,
238271
) -> String {
239272
let lines: Vec<&str> = source_code.lines().collect();
240273
if line_row < lines.len() {
241274
let line = lines[line_row];
242-
line.chars().take(indent_length).collect()
275+
// Convert byte position to character position first
276+
if indent_byte_length <= line.len() {
277+
let indent_char_count = line[..indent_byte_length].chars().count();
278+
line.chars().take(indent_char_count).collect()
279+
} else {
280+
// If byte length exceeds line length, take all characters
281+
line.to_string()
282+
}
243283
} else {
244284
String::new()
245285
}

src/game/stage_renderer.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,8 @@ impl StageRenderer {
242242
lines.push(Line::from(line_spans));
243243
}
244244

245+
let mut byte_position = 0; // Track byte position as we iterate
246+
245247
for (i, &ch) in self.chars.iter().enumerate() {
246248
// Add line number at the start of each line
247249
if line_start {
@@ -266,14 +268,15 @@ impl StageRenderer {
266268
current_line_width = 0;
267269
line_number += 1;
268270
line_start = true;
271+
byte_position += ch.len_utf8(); // Update byte position
269272
continue;
270273
}
271274

272-
// Check if this character is in a comment
275+
// Check if this character is in a comment using byte position
273276
let is_in_comment = params
274277
.display_comment_ranges
275278
.iter()
276-
.any(|&(start, end)| i >= start && i < end);
279+
.any(|&(start, end)| byte_position >= start && byte_position < end);
277280

278281
// Determine character style
279282
let style = if is_in_comment {
@@ -319,6 +322,9 @@ impl StageRenderer {
319322

320323
current_line_spans.push(Span::styled(display_char, style));
321324
current_line_width += char_width;
325+
326+
// Update byte position for next iteration
327+
byte_position += ch.len_utf8();
322328
}
323329

324330
if !current_line_spans.is_empty() {

src/game/text_processor.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,10 @@ impl TextProcessor {
9191
};
9292

9393
let mapped_end = if end <= position_mapping.len() {
94-
// Find the last non-None position before end
9594
(0..end)
9695
.rev()
9796
.find_map(|i| position_mapping.get(i).and_then(|&pos| pos))
98-
.map(|pos| pos + 1) // +1 because end is exclusive
97+
.map(|pos| pos + 1)
9998
} else {
10099
None
101100
};

0 commit comments

Comments
 (0)