-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathpbm.go
More file actions
112 lines (95 loc) · 2.5 KB
/
pbm.go
File metadata and controls
112 lines (95 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
package main
import (
"bytes"
"fmt"
"os"
"path/filepath"
"slices"
"strings"
"github.com/mfonda/simhash"
)
// readAndHashFile loads the file at path and fingerprints it line by line.
//
// At most maxReadSizeBytes bytes of the file are considered. The file is
// rejected as binary when a NUL byte occurs within the first 10,000 bytes of
// that content. The remaining content is split on "\n"; every line is passed
// through spaceMap (defined elsewhere in the package), lower-cased and reduced
// to a 64-bit simhash over its word features.
//
// The returned duplicateFile has ID 0, Location set to path, Extension set to
// the file extension without its leading dot, LineHashes holding one hash per
// line in file order, and SortedUniqueHashes holding the same hashes sorted
// ascending with duplicates removed.
//
// An error is returned when the file cannot be read or appears to be binary.
func readAndHashFile(path string) (*duplicateFile, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	// Ignore everything past the configured read limit.
	if len(data) > int(maxReadSizeBytes) {
		data = data[:maxReadSizeBytes]
	}

	// Text files are not expected to contain NUL bytes; only a bounded prefix
	// is probed.
	probe := data[:min(len(data), 10_000)]
	if bytes.IndexByte(probe, 0) != -1 {
		return nil, fmt.Errorf("file appears to be binary: %s", path)
	}

	// filepath.Ext yields either "" or ".ext"; drop the dot when present.
	extension := strings.TrimPrefix(filepath.Ext(path), ".")

	rows := strings.Split(string(data), "\n")
	perLine := make([]uint64, len(rows))
	for i, row := range rows {
		normalized := strings.ToLower(spaceMap(row))
		features := simhash.NewWordFeatureSet([]byte(normalized))
		perLine[i] = simhash.Simhash(features)
	}

	// Sort and de-duplicate a copy so perLine keeps the original line order.
	distinct := slices.Clone(perLine)
	slices.Sort(distinct)
	distinct = slices.Compact(distinct)

	return &duplicateFile{
		ID:                 0,
		Location:           path,
		Extension:          extension,
		LineHashes:         perLine,
		SortedUniqueHashes: distinct,
	}, nil
}
// processPBM hashes pbmFileA and pbmFileB, computes their match matrix with
// identifyDuplicates and writes it to pbmOutput as a PBM image.
//
// Any failure is reported on stderr and ends the process with exit status 1.
// On success a one-line summary with the output path and the matrix
// dimensions is printed to stdout.
func processPBM() {
	// fail reports a formatted message on stderr and terminates the process.
	fail := func(format string, args ...any) {
		fmt.Fprintf(os.Stderr, format, args...)
		os.Exit(1)
	}

	left, err := readAndHashFile(pbmFileA)
	if err != nil {
		fail("error reading %s: %s\n", pbmFileA, err)
	}

	right, err := readAndHashFile(pbmFileB)
	if err != nil {
		fail("error reading %s: %s\n", pbmFileB, err)
	}
	// Give the second file an ID distinct from the first one's (0).
	right.ID = 1

	// sameFile is always false here: the plot should show every match,
	// including the main diagonal when a file is compared with itself.
	matrix := identifyDuplicates(*left, *right, false, fuzzValue)

	err = writePBM(matrix, pbmOutput)
	if err != nil {
		fail("error writing PBM: %s\n", err)
	}

	cols, rows := len(matrix[0]), len(matrix)
	fmt.Printf("PBM scatter plot written to %s (%d x %d)\n", pbmOutput, cols, rows)
}
// writePBM serializes matrix as a plain-text ("P1") PBM bitmap at outputPath.
//
// The header declares the width as the length of the first row and the height
// as the number of rows. Each matrix row is written on its own line with cells
// separated by a single space; true is written as 1 and false as 0.
//
// An error is returned when matrix is empty, when any row's length differs
// from the first row's (the header would otherwise misdescribe the raster), or
// when the file cannot be written. Validation happens before any I/O, so no
// file is created for an invalid matrix. The file is written with mode 0644.
//
// NOTE(review): the Netpbm plain PBM specification recommends lines of at most
// 70 characters; rows wider than 35 cells exceed that. One row per line is
// kept to preserve the existing output layout.
func writePBM(matrix [][]bool, outputPath string) error {
	if len(matrix) == 0 {
		return fmt.Errorf("empty matrix")
	}

	height := len(matrix)
	width := len(matrix[0])
	for i, row := range matrix {
		if len(row) != width {
			return fmt.Errorf("ragged matrix: row %d has %d columns, want %d", i, len(row), width)
		}
	}

	// A bytes.Buffer lets the finished image be handed to os.WriteFile without
	// the extra full copy a string-to-[]byte conversion would make. A row takes
	// at most 2*width+1 bytes (digits, separators, newline); 64 bytes is ample
	// room for the header.
	var buf bytes.Buffer
	buf.Grow(64 + height*(2*width+1))

	buf.WriteString("P1\n")
	buf.WriteString("# dcd scatter plot\n")
	fmt.Fprintf(&buf, "%d %d\n", width, height)

	for _, row := range matrix {
		for j, val := range row {
			if j > 0 {
				buf.WriteByte(' ')
			}
			if val {
				buf.WriteByte('1')
			} else {
				buf.WriteByte('0')
			}
		}
		buf.WriteByte('\n')
	}

	return os.WriteFile(outputPath, buf.Bytes(), 0644)
}