Skip to content

Commit a2fe4a6

Browse files
unhappychoice and claude committed
fix: correct comment range detection in middle chunk extraction
Fix a bug where comment ranges in middle chunks were not detected because the wrong byte-to-char cache was used. The extract_comment_ranges function was called with an empty cache slice (`&[]`) instead of the chunk_byte_to_char_cache built from the chunk content, so comment positions were calculated incorrectly. This fix ensures comments within middle chunks are recognized and skipped during typing practice.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 3ed12d2 commit a2fe4a6

File tree

17 files changed

+744
-93
lines changed

17 files changed

+744
-93
lines changed

src/extractor/core/extractor.rs

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -122,9 +122,13 @@ impl CommonExtractor {
122122
large_chunk.content.as_bytes(),
123123
);
124124

125-
let chunk_comment_ranges =
126-
Self::extract_comment_ranges(&chunk_tree, &large_chunk.content, language, &[])?;
127125
let chunk_byte_to_char_cache = Self::build_byte_to_char_cache(&large_chunk.content);
126+
let chunk_comment_ranges = Self::extract_comment_ranges(
127+
&chunk_tree,
128+
&large_chunk.content,
129+
language,
130+
&chunk_byte_to_char_cache,
131+
)?;
128132

129133
while let Some(match_) = chunk_matches.next() {
130134
for capture in match_.captures {

tests/integration/languages/snapshots/r#mod__integration__languages__extractor__test_c_complex_algorithm_extraction.snap

Lines changed: 78 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -170,7 +170,20 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
170170
},
171171
{
172172
"chunk_type": "Loop",
173-
"comment_ranges": [],
173+
"comment_ranges": [
174+
[
175+
435,
176+
475
177+
],
178+
[
179+
547,
180+
555
181+
],
182+
[
183+
727,
184+
750
185+
]
186+
],
174187
"content": " for (size_t i = 0; i < input_size; i++) {\n int value = input[i];\n ProcessedItem item;\n item.id = (int)i;\n\n if (value > threshold) {\n int transformed = value * 2;\n item.value = transformed;\n\n if (transformed > threshold * 3) {\n strcpy(item.category, \"HIGH\");\n } else {\n strcpy(item.category, \"MEDIUM\");\n }\n\n // Additional processing for high values\n if (transformed > 100) {\n item.value += 10; // bonus\n }\n } else if (value > 0) {\n item.value = value + threshold;\n strcpy(item.category, \"LOW\");\n } else {\n continue; // skip negative values\n }\n\n results[result_count++] = item;\n }",
175188
"end_line": 54,
176189
"language": "c",
@@ -190,7 +203,16 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
190203
},
191204
{
192205
"chunk_type": "CodeBlock",
193-
"comment_ranges": [],
206+
"comment_ranges": [
207+
[
208+
304,
209+
344
210+
],
211+
[
212+
416,
213+
424
214+
]
215+
],
194216
"content": " if (value > threshold) {\n int transformed = value * 2;\n item.value = transformed;\n\n if (transformed > threshold * 3) {\n strcpy(item.category, \"HIGH\");\n } else {\n strcpy(item.category, \"MEDIUM\");\n }\n\n // Additional processing for high values\n if (transformed > 100) {\n item.value += 10; // bonus\n }\n }",
195217
"end_line": 46,
196218
"language": "c",
@@ -200,7 +222,20 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
200222
},
201223
{
202224
"chunk_type": "Conditional",
203-
"comment_ranges": [],
225+
"comment_ranges": [
226+
[
227+
304,
228+
344
229+
],
230+
[
231+
416,
232+
424
233+
],
234+
[
235+
596,
236+
619
237+
]
238+
],
204239
"content": " if (value > threshold) {\n int transformed = value * 2;\n item.value = transformed;\n\n if (transformed > threshold * 3) {\n strcpy(item.category, \"HIGH\");\n } else {\n strcpy(item.category, \"MEDIUM\");\n }\n\n // Additional processing for high values\n if (transformed > 100) {\n item.value += 10; // bonus\n }\n } else if (value > 0) {\n item.value = value + threshold;\n strcpy(item.category, \"LOW\");\n } else {\n continue; // skip negative values\n }",
205240
"end_line": 51,
206241
"language": "c",
@@ -250,7 +285,12 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
250285
},
251286
{
252287
"chunk_type": "Conditional",
253-
"comment_ranges": [],
288+
"comment_ranges": [
289+
[
290+
71,
291+
79
292+
]
293+
],
254294
"content": " if (transformed > 100) {\n item.value += 10; // bonus\n }",
255295
"end_line": 45,
256296
"language": "c",
@@ -270,7 +310,12 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
270310
},
271311
{
272312
"chunk_type": "Conditional",
273-
"comment_ranges": [],
313+
"comment_ranges": [
314+
[
315+
157,
316+
180
317+
]
318+
],
274319
"content": " } else if (value > 0) {\n item.value = value + threshold;\n strcpy(item.category, \"LOW\");\n } else {\n continue; // skip negative values\n }",
275320
"end_line": 51,
276321
"language": "c",
@@ -280,7 +325,12 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
280325
},
281326
{
282327
"chunk_type": "CodeBlock",
283-
"comment_ranges": [],
328+
"comment_ranges": [
329+
[
330+
39,
331+
62
332+
]
333+
],
284334
"content": " } else {\n continue; // skip negative values\n }",
285335
"end_line": 51,
286336
"language": "c",
@@ -290,7 +340,16 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
290340
},
291341
{
292342
"chunk_type": "Conditional",
293-
"comment_ranges": [],
343+
"comment_ranges": [
344+
[
345+
36,
346+
71
347+
],
348+
[
349+
254,
350+
302
351+
]
352+
],
294353
"content": " if (result_count > 0) {\n // Calculate average for validation\n int total = 0;\n for (size_t i = 0; i < result_count; i++) {\n total += results[i].value;\n }\n int average = total / (int)result_count;\n\n // Add average as metadata (simplified approach)\n printf(\"Average processed value: %d\\n\", average);\n }",
295354
"end_line": 67,
296355
"language": "c",
@@ -387,7 +446,12 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
387446
},
388447
{
389448
"chunk_type": "Loop",
390-
"comment_ranges": [],
449+
"comment_ranges": [
450+
[
451+
541,
452+
574
453+
]
454+
],
391455
"content": " for (size_t i = 0; i < count; i++) {\n ProcessedItem *item = &items[i];\n int category_index = -1;\n\n if (strcmp(item->category, \"LOW\") == 0) {\n category_index = 0;\n } else if (strcmp(item->category, \"MEDIUM\") == 0) {\n category_index = 1;\n } else if (strcmp(item->category, \"HIGH\") == 0) {\n category_index = 2;\n }\n\n if (category_index >= 0) {\n category_counts[category_index]++;\n value_sums[category_index] += item->value;\n\n // Time-based analysis simulation\n if (item->value > 1000) {\n printf(\"High value item found: %d\\n\", item->value);\n }\n }\n }",
392456
"end_line": 102,
393457
"language": "c",
@@ -457,7 +521,12 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
457521
},
458522
{
459523
"chunk_type": "Conditional",
460-
"comment_ranges": [],
524+
"comment_ranges": [
525+
[
526+
150,
527+
183
528+
]
529+
],
461530
"content": " if (category_index >= 0) {\n category_counts[category_index]++;\n value_sums[category_index] += item->value;\n\n // Time-based analysis simulation\n if (item->value > 1000) {\n printf(\"High value item found: %d\\n\", item->value);\n }\n }",
462531
"end_line": 101,
463532
"language": "c",

tests/integration/languages/snapshots/r#mod__integration__languages__extractor__test_cpp_complex_algorithm_extraction.snap

Lines changed: 88 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -252,7 +252,20 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
252252
},
253253
{
254254
"chunk_type": "Loop",
255-
"comment_ranges": [],
255+
"comment_ranges": [
256+
[
257+
542,
258+
582
259+
],
260+
[
261+
713,
262+
721
263+
],
264+
[
265+
908,
266+
931
267+
]
268+
],
256269
"content": " for (size_t i = 0; i < input.size(); ++i) {\n const T& value = input[i];\n std::string cache_key = \"item_\" + std::to_string(i);\n\n auto cache_it = cache.find(cache_key);\n if (cache_it != cache.end()) {\n results.push_back(cache_it->second);\n continue;\n }\n\n T processed_value;\n if (value > static_cast<T>(threshold)) {\n processed_value = value * static_cast<T>(2);\n processed_count++;\n\n // Additional processing for high values\n if (processed_value > static_cast<T>(threshold * 3)) {\n processed_value += static_cast<T>(10); // bonus\n }\n } else if (value > static_cast<T>(0)) {\n processed_value = value + static_cast<T>(threshold);\n } else {\n continue; // skip negative values\n }\n\n cache[cache_key] = processed_value;\n processing_log.push_back(processed_value);\n results.push_back(processed_value);\n }",
257270
"end_line": 52,
258271
"language": "cpp",
@@ -292,7 +305,16 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
292305
},
293306
{
294307
"chunk_type": "CodeBlock",
295-
"comment_ranges": [],
308+
"comment_ranges": [
309+
[
310+
166,
311+
206
312+
],
313+
[
314+
337,
315+
345
316+
]
317+
],
296318
"content": " if (value > static_cast<T>(threshold)) {\n processed_value = value * static_cast<T>(2);\n processed_count++;\n\n // Additional processing for high values\n if (processed_value > static_cast<T>(threshold * 3)) {\n processed_value += static_cast<T>(10); // bonus\n }\n }",
297319
"end_line": 43,
298320
"language": "cpp",
@@ -302,7 +324,20 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
302324
},
303325
{
304326
"chunk_type": "Conditional",
305-
"comment_ranges": [],
327+
"comment_ranges": [
328+
[
329+
166,
330+
206
331+
],
332+
[
333+
337,
334+
345
335+
],
336+
[
337+
532,
338+
555
339+
]
340+
],
306341
"content": " if (value > static_cast<T>(threshold)) {\n processed_value = value * static_cast<T>(2);\n processed_count++;\n\n // Additional processing for high values\n if (processed_value > static_cast<T>(threshold * 3)) {\n processed_value += static_cast<T>(10); // bonus\n }\n } else if (value > static_cast<T>(0)) {\n processed_value = value + static_cast<T>(threshold);\n } else {\n continue; // skip negative values\n }",
307342
"end_line": 47,
308343
"language": "cpp",
@@ -312,7 +347,12 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
312347
},
313348
{
314349
"chunk_type": "Conditional",
315-
"comment_ranges": [],
350+
"comment_ranges": [
351+
[
352+
130,
353+
138
354+
]
355+
],
316356
"content": " if (processed_value > static_cast<T>(threshold * 3)) {\n processed_value += static_cast<T>(10); // bonus\n }",
317357
"end_line": 42,
318358
"language": "cpp",
@@ -332,7 +372,12 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
332372
},
333373
{
334374
"chunk_type": "Conditional",
335-
"comment_ranges": [],
375+
"comment_ranges": [
376+
[
377+
168,
378+
191
379+
]
380+
],
336381
"content": " } else if (value > static_cast<T>(0)) {\n processed_value = value + static_cast<T>(threshold);\n } else {\n continue; // skip negative values\n }",
337382
"end_line": 47,
338383
"language": "cpp",
@@ -342,7 +387,12 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
342387
},
343388
{
344389
"chunk_type": "CodeBlock",
345-
"comment_ranges": [],
390+
"comment_ranges": [
391+
[
392+
47,
393+
70
394+
]
395+
],
346396
"content": " } else {\n continue; // skip negative values\n }",
347397
"end_line": 47,
348398
"language": "cpp",
@@ -352,7 +402,12 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
352402
},
353403
{
354404
"chunk_type": "Conditional",
355-
"comment_ranges": [],
405+
"comment_ranges": [
406+
[
407+
202,
408+
236
409+
]
410+
],
356411
"content": " if (processed_count > 0) {\n T total = std::accumulate(results.begin(), results.end(), static_cast<T>(0));\n T average = total / static_cast<T>(results.size());\n\n // Add average to log for analysis\n processing_log.push_back(average);\n }",
357412
"end_line": 61,
358413
"language": "cpp",
@@ -415,7 +470,12 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
415470
},
416471
{
417472
"chunk_type": "CodeBlock",
418-
"comment_ranges": [],
473+
"comment_ranges": [
474+
[
475+
395,
476+
426
477+
]
478+
],
419479
"content": " for (const auto& item : data) {\n std::string category;\n\n if (item > static_cast<T>(threshold * 2)) {\n category = \"HIGH\";\n } else if (item > static_cast<T>(threshold)) {\n category = \"MEDIUM\";\n } else {\n category = \"LOW\";\n }\n\n categories[category].push_back(item);\n\n // Additional pattern detection\n if (item > static_cast<T>(1000)) {\n categories[\"PREMIUM\"].push_back(item);\n }\n }",
420480
"end_line": 88,
421481
"language": "cpp",
@@ -589,7 +649,20 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
589649
},
590650
{
591651
"chunk_type": "CodeBlock",
592-
"comment_ranges": [],
652+
"comment_ranges": [
653+
[
654+
96,
655+
134
656+
],
657+
[
658+
259,
659+
300
660+
],
661+
[
662+
599,
663+
633
664+
]
665+
],
593666
"content": " for (const auto& text : input) {\n std::string processed = text;\n\n // Pattern matching and transformation\n size_t pos = 0;\n while ((pos = processed.find(pattern, pos)) != std::string::npos) {\n // Replace pattern with uppercase version\n std::string replacement = pattern;\n std::transform(replacement.begin(), replacement.end(), replacement.begin(), ::toupper);\n processed.replace(pos, pattern.length(), replacement);\n pos += replacement.length();\n }\n\n // Additional text transformations\n if (processed.length() > 50) {\n processed = processed.substr(0, 47) + \"...\";\n }\n\n if (!processed.empty()) {\n results.push_back(processed);\n }\n }",
594667
"end_line": 133,
595668
"language": "cpp",
@@ -619,7 +692,12 @@ expression: "serde_json::to_string_pretty(&snapshot_data).unwrap()"
619692
},
620693
{
621694
"chunk_type": "Loop",
622-
"comment_ranges": [],
695+
"comment_ranges": [
696+
[
697+
96,
698+
137
699+
]
700+
],
623701
"content": " while ((pos = processed.find(pattern, pos)) != std::string::npos) {\n // Replace pattern with uppercase version\n std::string replacement = pattern;\n std::transform(replacement.begin(), replacement.end(), replacement.begin(), ::toupper);\n processed.replace(pos, pattern.length(), replacement);\n pos += replacement.length();\n }",
624702
"end_line": 123,
625703
"language": "cpp",

0 commit comments

Comments
 (0)