Skip to content

Commit c9512c0

Browse files
authored
Reduce string allocations (#687)
* reduce string allocations
* Pre-parse remap rules per run
* fix logging in unknownRemapLanguage
* microbenchmark for DetermineLanguage
* microbenchmarks for remapping functions
1 parent cc8311e commit c9512c0

7 files changed

Lines changed: 228 additions & 38 deletions

File tree

processor/detector.go

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
package processor
44

55
import (
6+
"bytes"
67
"cmp"
78
"errors"
89
"slices"
@@ -166,11 +167,9 @@ func DetermineLanguage(filename string, fallbackLanguage string, possibleLanguag
166167

167168
startTime := makeTimestampNano()
168169

169-
var toCheck string
170+
toCheck := content
170171
if len(content) > 20_000 {
171-
toCheck = string(content)[:20_000]
172-
} else {
173-
toCheck = string(content)
172+
toCheck = content[:20_000]
174173
}
175174

176175
primary := ""
@@ -182,8 +181,8 @@ func DetermineLanguage(filename string, fallbackLanguage string, possibleLanguag
182181
LanguageFeaturesMutex.Unlock()
183182

184183
count := 0
185-
for _, key := range langFeatures.Keywords {
186-
if strings.Contains(toCheck, key) {
184+
for _, key := range langFeatures.KeywordBytes {
185+
if bytes.Contains(toCheck, key) {
187186
count++
188187
}
189188
}

processor/detector_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
package processor
44

55
import (
6+
"strings"
67
"testing"
78
)
89

@@ -418,3 +419,34 @@ func BenchmarkScanSheBangReal(b *testing.B) {
418419
_, _ = scanForSheBang([]byte("#! /usr/bin/env perl -w"))
419420
}
420421
}
422+
423+
func BenchmarkDetermineLanguage(b *testing.B) {
424+
ProcessConstants()
425+
426+
coqContent := []byte("Require Hypothesis Inductive\n")
427+
systemVerilogContent := []byte("endmodule posedge edge always wire\n")
428+
largeCoqContent := []byte("Require Hypothesis Inductive\n" + strings.Repeat("x", 25_000))
429+
largeSystemVerilogContent := []byte("endmodule posedge edge always wire\n" + strings.Repeat("y", 25_000))
430+
possibleLanguages := []string{"Coq", "SystemVerilog"}
431+
432+
benchmarks := []struct {
433+
name string
434+
content []byte
435+
}{
436+
{name: "small_coq", content: coqContent},
437+
{name: "small_systemverilog", content: systemVerilogContent},
438+
{name: "large_coq_over_cutoff", content: largeCoqContent},
439+
{name: "large_systemverilog_over_cutoff", content: largeSystemVerilogContent},
440+
}
441+
442+
for _, benchmark := range benchmarks {
443+
b.Run(benchmark.name, func(b *testing.B) {
444+
b.ReportAllocs()
445+
b.SetBytes(int64(len(benchmark.content)))
446+
447+
for i := 0; i < b.N; i++ {
448+
_ = DetermineLanguage("", "", possibleLanguages, benchmark.content)
449+
}
450+
})
451+
}
452+
}

processor/processor.go

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,43 @@ var RemapUnknown = ""
145145
// RemapAll allows remapping of all files with a string to search the content for
146146
var RemapAll = ""
147147

148+
type remapRule struct {
149+
pattern []byte
150+
language string
151+
}
152+
153+
type remapConfig struct {
154+
all []remapRule
155+
unknown []remapRule
156+
}
157+
158+
type processorContext struct {
159+
remap remapConfig
160+
}
161+
162+
func parseRemapRules(value string) []remapRule {
163+
rules := []remapRule{}
164+
165+
for s := range strings.SplitSeq(value, ",") {
166+
t := strings.Split(s, ":")
167+
if len(t) == 2 {
168+
rules = append(rules, remapRule{
169+
pattern: []byte(t[0]),
170+
language: t[1],
171+
})
172+
}
173+
}
174+
175+
return rules
176+
}
177+
178+
func newRemapConfig(remapAll string, remapUnknown string) remapConfig {
179+
return remapConfig{
180+
all: parseRemapRules(remapAll),
181+
unknown: parseRemapRules(remapUnknown),
182+
}
183+
}
184+
148185
// CurrencySymbol allows setting the currency symbol for cocomo project cost estimation
149186
var CurrencySymbol = ""
150187

@@ -425,6 +462,7 @@ func processLanguageFeature(name string, value Language) {
425462
mlCommentTrie := &Trie{}
426463
stringTrie := &Trie{}
427464
tokenTrie := &Trie{}
465+
keywordBytes := make([][]byte, 0, len(value.Keywords))
428466

429467
complexityMask := byte(0)
430468
singleLineCommentMask := byte(0)
@@ -464,6 +502,10 @@ func processLanguageFeature(name string, value Language) {
464502
}
465503
processMask |= stringMask
466504

505+
for _, v := range value.Keywords {
506+
keywordBytes = append(keywordBytes, []byte(v))
507+
}
508+
467509
LanguageFeaturesMutex.Lock()
468510
LanguageFeatures[name] = LanguageFeature{
469511
Complexity: complexityTrie,
@@ -480,6 +522,7 @@ func processLanguageFeature(name string, value Language) {
480522
StringCheckMask: stringMask,
481523
ProcessMask: processMask,
482524
Keywords: value.Keywords,
525+
KeywordBytes: keywordBytes,
483526
Quotes: value.Quotes,
484527
}
485528
LanguageFeaturesMutex.Unlock()
@@ -608,6 +651,7 @@ func Process() {
608651
}
609652

610653
SortBy = strings.ToLower(SortBy)
654+
ctx := processorContext{remap: newRemapConfig(RemapAll, RemapUnknown)}
611655

612656
printDebugF("NumCPU: %d", runtime.NumCPU())
613657
printDebugF("SortBy: %s", SortBy)
@@ -692,7 +736,7 @@ func Process() {
692736
close(fileListQueue)
693737
}()
694738

695-
go fileProcessorWorker(fileListQueue, fileSummaryJobQueue)
739+
go ctx.fileProcessorWorker(fileListQueue, fileSummaryJobQueue)
696740

697741
result := fileSummarize(fileSummaryJobQueue)
698742
if FileOutput == "" {

processor/result.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ func ProcessResult() ([]LanguageSummary, error) {
4242
}
4343

4444
SortBy = strings.ToLower(SortBy)
45+
ctx := processorContext{remap: newRemapConfig(RemapAll, RemapUnknown)}
4546

4647
printDebugF("NumCPU: %d", runtime.NumCPU())
4748
printDebugF("SortBy: %s", SortBy)
@@ -126,7 +127,7 @@ func ProcessResult() ([]LanguageSummary, error) {
126127
close(fileListQueue)
127128
}()
128129

129-
go fileProcessorWorker(fileListQueue, fileSummaryJobQueue)
130+
go ctx.fileProcessorWorker(fileListQueue, fileSummaryJobQueue)
130131

131132
language := aggregateLanguageSummary(fileSummaryJobQueue)
132133
language = sortLanguageSummary(language)

processor/structs.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ type LanguageFeature struct {
6565
StringCheckMask byte
6666
ProcessMask byte
6767
Keywords []string
68+
KeywordBytes [][]byte
6869
Quotes []Quote
6970
}
7071

processor/workers.go

Lines changed: 23 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -688,7 +688,7 @@ func checkBomSkip(fileJob *FileJob) int {
688688

689689
// Reads and processes files from input chan in parallel, and sends results to
690690
// output chan
691-
func fileProcessorWorker(input chan *FileJob, output chan *FileJob) {
691+
func (ctx processorContext) fileProcessorWorker(input chan *FileJob, output chan *FileJob) {
692692
var startTime int64
693693
var fileCount int64
694694
var gcEnabled int64
@@ -721,7 +721,7 @@ func fileProcessorWorker(input chan *FileJob, output chan *FileJob) {
721721

722722
if err == nil {
723723
job.Content = content
724-
if processFile(job) {
724+
if ctx.processFile(job) {
725725
output <- job
726726
}
727727
} else {
@@ -743,7 +743,7 @@ func fileProcessorWorker(input chan *FileJob, output chan *FileJob) {
743743

744744
// Process a single file
745745
// File must have been read to job.Content already
746-
func processFile(job *FileJob) bool {
746+
func (ctx processorContext) processFile(job *FileJob) bool {
747747
fileStartTime := makeTimestampNano()
748748

749749
contents := job.Content
@@ -752,14 +752,14 @@ func processFile(job *FileJob) bool {
752752
job.Language = DetermineLanguage(job.Filename, job.Language, job.PossibleLanguages, job.Content)
753753

754754
remapped := false
755-
if RemapAll != "" {
756-
hardRemapLanguage(job)
755+
if len(ctx.remap.all) != 0 {
756+
ctx.hardRemapLanguage(job)
757757
}
758758

759759
// If the type is #! we should check to see if we can identify
760760
if job.Language == SheBang {
761-
if RemapUnknown != "" {
762-
remapped = unknownRemapLanguage(job)
761+
if len(ctx.remap.unknown) != 0 {
762+
remapped = ctx.unknownRemapLanguage(job)
763763
}
764764

765765
// if we didn't remap we then want to see if it's a #! map
@@ -835,36 +835,30 @@ func processFile(job *FileJob) bool {
835835
return true
836836
}
837837

838-
func hardRemapLanguage(job *FileJob) bool {
838+
func (ctx processorContext) hardRemapLanguage(job *FileJob) bool {
839839
remapped := false
840-
for s := range strings.SplitSeq(RemapAll, ",") {
841-
t := strings.Split(s, ":")
842-
if len(t) == 2 {
843-
cutoff := min(1000, len(job.Content)) // at most 1000 bytes into the file to look
844-
845-
if strings.Contains(string(job.Content[:cutoff]), t[0]) {
846-
job.Language = t[1]
847-
remapped = true
848-
printWarnF("hard remapping: %s to %s", job.Location, job.Language)
849-
}
840+
cutoff := min(1000, len(job.Content)) // at most 1000 bytes into the file to look
841+
842+
for _, rule := range ctx.remap.all {
843+
if bytes.Contains(job.Content[:cutoff], rule.pattern) {
844+
job.Language = rule.language
845+
remapped = true
846+
printWarnF("hard remapping: %s to %s", job.Location, job.Language)
850847
}
851848
}
852849

853850
return remapped
854851
}
855852

856-
func unknownRemapLanguage(job *FileJob) bool {
853+
func (ctx processorContext) unknownRemapLanguage(job *FileJob) bool {
857854
remapped := false
858-
for s := range strings.SplitSeq(RemapUnknown, ",") {
859-
t := strings.Split(s, ":")
860-
if len(t) == 2 {
861-
cutoff := min(1000, len(job.Content)) // at most 1000 bytes into the file to look
862-
863-
if strings.Contains(string(job.Content[:cutoff]), t[0]) {
864-
printWarnF("unknown remapping: %s to %s", job.Location, job.Language)
865-
job.Language = t[1]
866-
remapped = true
867-
}
855+
cutoff := min(1000, len(job.Content)) // at most 1000 bytes into the file to look
856+
857+
for _, rule := range ctx.remap.unknown {
858+
if bytes.Contains(job.Content[:cutoff], rule.pattern) {
859+
job.Language = rule.language
860+
remapped = true
861+
printWarnF("unknown remapping: %s to %s", job.Location, job.Language)
868862
}
869863
}
870864

0 commit comments

Comments
 (0)