Skip to content

Commit 45dd114

Browse files
Merge pull request #179 from unhappychoice/refactor/remove-indentation-normalization
refactor: remove indentation normalization and fix related issues
2 parents b4d24b3 + 62ed39e commit 45dd114

File tree

42 files changed

+1772
-622
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+1772
-622
lines changed

examples/debug_comment.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
use gittype::game::typing_core::{ProcessingOptions, TypingCore};
2+
3+
fn main() {
4+
let code = r#"fn test() {
5+
// Path symbols: ../.. and ./path and ~/home
6+
// Unicode arrows: → ← ↑ ↓ and ↵ symbol
7+
// Mixed symbols: ../../config.json → ~/.config/
8+
let x = 42;
9+
}"#;
10+
11+
let comment_ranges = vec![
12+
(
13+
code.find("// Path symbols").unwrap(),
14+
code.find("~/home").unwrap() + "~/home".len(),
15+
),
16+
(
17+
code.find("// Unicode arrows").unwrap(),
18+
code.find("↵ symbol").unwrap() + "↵ symbol".len(),
19+
),
20+
(
21+
code.find("// Mixed symbols").unwrap(),
22+
code.find("~/.config/").unwrap() + "~/.config/".len(),
23+
),
24+
];
25+
26+
println!(
27+
"code.len bytes={}, chars={}",
28+
code.len(),
29+
code.chars().count()
30+
);
31+
println!("comment_ranges (bytes): {:?}", comment_ranges);
32+
33+
let typing_core = TypingCore::new(code, &comment_ranges, ProcessingOptions::default());
34+
let display = typing_core.text_to_display().to_string();
35+
println!(
36+
"display bytes={}, chars={}",
37+
display.len(),
38+
display.chars().count()
39+
);
40+
println!("display: {}", display);
41+
let ranges = typing_core.display_comment_ranges();
42+
println!("display_ranges: {:?}", ranges);
43+
for (i, (s, e)) in ranges.iter().enumerate() {
44+
let frag = &display[*s..*e];
45+
println!("[{}] {:?}", i, frag);
46+
}
47+
48+
// Show how we convert bytes->chars for original ranges
49+
let to_char = |b: usize| code[..b.min(code.len())].chars().count();
50+
let converted: Vec<(usize, usize)> = comment_ranges
51+
.iter()
52+
.map(|&(s, e)| (to_char(s), to_char(e)))
53+
.collect();
54+
println!("converted char ranges: {:?}", converted);
55+
let code_chars: Vec<char> = code.chars().collect();
56+
for (i, (cs, ce)) in converted.iter().copied().enumerate() {
57+
let text: String = code_chars[cs..ce].iter().collect();
58+
println!("orig[{}]: {:?} (chars {})", i, text, text.chars().count());
59+
}
60+
61+
// Show line char starts
62+
let mut acc = 0usize;
63+
for (i, line) in code.lines().enumerate() {
64+
let start = acc;
65+
let end = start + line.chars().count();
66+
println!("line {} chars {}..{} => {:?}", i + 1, start, end, line);
67+
acc = end + 1; // account for \n
68+
}
69+
}

src/extractor/challenge_converter.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ impl ChallengeConverter {
190190
// Check if truncated content meets minimum requirements
191191
let adjusted_comment_ranges = self.adjust_comment_ranges_for_truncation(
192192
&chunk.comment_ranges,
193-
truncated_content.len(),
193+
truncated_content.chars().count(),
194194
);
195195
let truncated_code_chars =
196196
self.count_code_characters(truncated_content, &adjusted_comment_ranges);

src/extractor/core/extractor.rs

Lines changed: 71 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,14 @@ impl CommonExtractor {
7070
while let Some(m) = matches.next() {
7171
for capture in m.captures {
7272
let node = capture.node;
73-
let start = node.start_byte();
74-
let end = node.end_byte();
73+
let start_byte = node.start_byte();
74+
let end_byte = node.end_byte();
7575

7676
if Self::is_valid_comment_node(node, language) {
77-
comment_ranges.push((start, end));
77+
// Convert byte positions to character positions
78+
let start_char = Self::byte_to_char_position(source_code, start_byte);
79+
let end_char = Self::byte_to_char_position(source_code, end_byte);
80+
comment_ranges.push((start_char, end_char));
7881
}
7982
}
8083
}
@@ -83,6 +86,13 @@ impl CommonExtractor {
8386
Ok(comment_ranges)
8487
}
8588

89+
/// Convert a byte offset into `source_code` into a character (Unicode scalar value) offset.
///
/// * `byte_pos` values past the end of the string are clamped to the string length.
/// * A `byte_pos` that falls inside a multi-byte character is rounded down to the start of
///   that character instead of panicking on a non-boundary slice.
///
/// Returns the number of characters that lie entirely before the (adjusted) byte offset.
fn byte_to_char_position(source_code: &str, byte_pos: usize) -> usize {
    let mut end = byte_pos.min(source_code.len());
    // Offset 0 is always a char boundary, so this loop terminates.
    while !source_code.is_char_boundary(end) {
        end -= 1;
    }
    source_code[..end].chars().count()
}
95+
8696
fn is_valid_comment_node(node: Node, language: &str) -> bool {
8797
let node_kind = node.kind();
8898
match language {
@@ -111,22 +121,27 @@ impl CommonExtractor {
111121
file_path: &Path,
112122
language: &str,
113123
capture_name: &str,
114-
file_comment_ranges: &[(usize, usize)],
124+
file_comment_ranges: &[(usize, usize)], // Already in character positions
115125
) -> Option<CodeChunk> {
116126
let start_byte = node.start_byte();
117127
let end_byte = node.end_byte();
118128
let content = &source_code[start_byte..end_byte];
119129

130+
// Convert byte positions to character positions to match file_comment_ranges
131+
let start_char = Self::byte_to_char_position(source_code, start_byte);
132+
let end_char = Self::byte_to_char_position(source_code, end_byte);
133+
120134
let start_line = node.start_position().row + 1;
121135
let end_line = node.end_position().row + 1;
122-
let original_indentation = node.start_position().column;
136+
let original_indentation_bytes = node.start_position().column;
123137

124138
// Extract actual indentation characters from source
125-
let original_indent_chars = if original_indentation > 0 {
126-
Self::extract_line_indent_chars(
139+
// Note: original_indentation_bytes is in byte units from TreeSitter, but we need char units
140+
let original_indent_chars = if original_indentation_bytes > 0 {
141+
Self::extract_line_indent_chars_corrected(
127142
source_code,
128143
node.start_position().row,
129-
original_indentation,
144+
original_indentation_bytes,
130145
)
131146
} else {
132147
String::new()
@@ -142,24 +157,42 @@ impl CommonExtractor {
142157
.or_else(|| Self::extract_name(node, source_code))
143158
.unwrap_or_else(|| "unknown".to_string());
144159

160+
let normalized_content =
161+
Self::normalize_first_line_indentation(content, &original_indent_chars);
162+
163+
// Simple position calculation:
164+
// code_start_pos = start_char (char position where the TreeSitter chunk starts)
165+
// chunk_start_pos = original_indentation_bytes (node.start_position().column)
166+
// comment_start_pos = raw comment position - code_start_pos
167+
168+
// Adjust comment ranges to be relative to the normalized content.
169+
// Note:
170+
// - file_comment_ranges are character-based positions for the whole file
171+
// - We first convert them to chunk-relative character positions
172+
// - Then we add the first-line indentation characters we injected at the very
173+
// beginning of the normalized content, so display-time positions match
174+
let indent_offset_chars = original_indent_chars.chars().count();
175+
145176
let chunk_comment_ranges: Vec<(usize, usize)> = file_comment_ranges
146177
.iter()
147-
.filter_map(|&(comment_start, comment_end)| {
148-
if comment_start >= start_byte && comment_end <= end_byte {
149-
Some((comment_start - start_byte, comment_end - start_byte))
178+
.filter_map(|&(comment_raw_pos_start, comment_raw_pos_end)| {
179+
// Check if comment is within this chunk's boundaries
180+
if comment_raw_pos_start >= start_char && comment_raw_pos_end <= end_char {
181+
// Convert to chunk-relative positions
182+
let comment_start_pos = comment_raw_pos_start - start_char;
183+
let comment_end_pos = comment_raw_pos_end - start_char;
184+
185+
// Account for added indentation at the very start of normalized content
186+
let adjusted_start = comment_start_pos + indent_offset_chars;
187+
let adjusted_end = comment_end_pos + indent_offset_chars;
188+
189+
Some((adjusted_start, adjusted_end))
150190
} else {
151191
None
152192
}
153193
})
154194
.collect();
155195

156-
let (normalized_content, normalized_comment_ranges) = Self::normalize_indentation(
157-
content,
158-
original_indentation,
159-
&original_indent_chars,
160-
&chunk_comment_ranges,
161-
);
162-
163196
Some(CodeChunk {
164197
content: normalized_content,
165198
file_path: file_path.to_path_buf(),
@@ -168,8 +201,9 @@ impl CommonExtractor {
168201
language: language.to_string(),
169202
chunk_type,
170203
name,
171-
comment_ranges: normalized_comment_ranges,
172-
original_indentation,
204+
comment_ranges: chunk_comment_ranges,
205+
// Store indentation as character count to keep extractor outputs character-based
206+
original_indentation: indent_offset_chars,
173207
})
174208
}
175209

@@ -209,111 +243,43 @@ impl CommonExtractor {
209243
None
210244
}
211245

212-
/// Prefix `content` with the indentation characters that preceded it on its first source line.
///
/// Every character of `content` is kept exactly as given, including `\r\n` line endings and
/// any trailing newline. The result is therefore always
/// `original_indent_chars.chars().count()` characters longer than `content`, so
/// character-based ranges computed against the untouched source stay aligned after adding
/// that same indent offset.
///
/// Empty `content` yields an empty string (no indentation is emitted for it).
///
/// NOTE(review): for CRLF sources the returned text now retains `\r`; confirm downstream
/// consumers tolerate it.
fn normalize_first_line_indentation(content: &str, original_indent_chars: &str) -> String {
    if content.is_empty() {
        return String::new();
    }

    let mut normalized = String::with_capacity(original_indent_chars.len() + content.len());
    normalized.push_str(original_indent_chars);
    normalized.push_str(content);
    normalized
}
307266

308-
fn extract_line_indent_chars(
267+
pub fn extract_line_indent_chars_corrected(
309268
source_code: &str,
310269
line_row: usize,
311-
indent_length: usize,
270+
indent_byte_length: usize,
312271
) -> String {
313272
let lines: Vec<&str> = source_code.lines().collect();
314273
if line_row < lines.len() {
315274
let line = lines[line_row];
316-
line.chars().take(indent_length).collect()
275+
// Convert byte position to character position first
276+
if indent_byte_length <= line.len() {
277+
let indent_char_count = line[..indent_byte_length].chars().count();
278+
line.chars().take(indent_char_count).collect()
279+
} else {
280+
// If byte length exceeds line length, take all characters
281+
line.to_string()
282+
}
317283
} else {
318284
String::new()
319285
}

0 commit comments

Comments
 (0)