Skip to content

Commit 5e2de87

Browse files
authored
Merge pull request #162 from greatroar/arm64-literal-copy
internal/lz4block: Short literal copying in arm64 decoder
2 parents 6bd757c + e99166d commit 5e2de87

File tree

1 file changed

+42
-30
lines changed

1 file changed

+42
-30
lines changed

internal/lz4block/decode_arm64.s

Lines changed: 42 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,23 +8,25 @@
88
#include "textflag.h"
99

1010
// Register allocation.
11-
#define dst R0
12-
#define dstorig R1
13-
#define src R2
14-
#define dstend R3
15-
#define srcend R4
16-
#define match R5 // Match address.
17-
#define dict R6
18-
#define dictlen R7
19-
#define dictend R8
20-
#define token R9
21-
#define len R10 // Literal and match lengths.
22-
#define lenRem R11
23-
#define offset R12 // Match offset.
24-
#define tmp1 R13
25-
#define tmp2 R14
26-
#define tmp3 R15
27-
#define tmp4 R16
11+
#define dst R0
12+
#define dstorig R1
13+
#define src R2
14+
#define dstend R3
15+
#define dstend16 R4 // dstend - 16
16+
#define srcend R5
17+
#define srcend16 R6 // srcend - 16
18+
#define match R7 // Match address.
19+
#define dict R8
20+
#define dictlen R9
21+
#define dictend R10
22+
#define token R11
23+
#define len R12 // Literal and match lengths.
24+
#define lenRem R13
25+
#define offset R14 // Match offset.
26+
#define tmp1 R15
27+
#define tmp2 R16
28+
#define tmp3 R17
29+
#define tmp4 R19
2830

2931
// func decodeBlock(dst, src, dict []byte) int
3032
TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $0-80
@@ -36,6 +38,12 @@ TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $0-80
3638
CBZ srcend, shortSrc
3739
ADD src, srcend
3840

41+
// dstend16 = max(dstend-16, 0) and similarly for srcend16.
42+
SUBS $16, dstend, dstend16
43+
CSEL LO, ZR, dstend16, dstend16
44+
SUBS $16, srcend, srcend16
45+
CSEL LO, ZR, srcend16, srcend16
46+
3947
LDP dict_base+48(FP), (dict, dictlen)
4048
ADD dict, dictlen, dictend
4149

@@ -71,27 +79,31 @@ readLitlenDone:
7179
// Copy literal.
7280
SUBS $16, len
7381
BLO copyLiteralShort
74-
AND $15, len, lenRem
7582

7683
copyLiteralLoop:
77-
SUBS $16, len
7884
LDP.P 16(src), (tmp1, tmp2)
7985
STP.P (tmp1, tmp2), 16(dst)
86+
SUBS $16, len
8087
BPL copyLiteralLoop
8188

82-
// lenRem = len%16 is the remaining number of bytes we need to copy.
83-
// Since len was >= 16, we can do this in one load and one store,
84-
// overlapping with the last load and store, without worrying about
85-
// writing out of bounds.
86-
ADD lenRem, src
87-
ADD lenRem, dst
88-
LDP -16(src), (tmp1, tmp2)
89-
STP (tmp1, tmp2), -16(dst)
89+
// Copy (final part of) literal of length 0-15.
90+
// If more than 16 bytes are left in both src and dst, just copy 16 bytes.
91+
copyLiteralShort:
92+
CMP dstend16, dst
93+
CCMP LO, src, srcend16, $0b0010 // 0010 = preserve carry (LO).
94+
BHS copyLiteralShortEnd
95+
96+
AND $15, len
97+
98+
LDP (src), (tmp1, tmp2)
99+
ADD len, src
100+
STP (tmp1, tmp2), (dst)
101+
ADD len, dst
90102

91103
B copyLiteralDone
92104

93-
// Copy literal of length 0-15.
94-
copyLiteralShort:
105+
// Safe but slow copy near the end of src, dst.
106+
copyLiteralShortEnd:
95107
TBZ $3, len, 3(PC)
96108
MOVD.P 8(src), tmp1
97109
MOVD.P tmp1, 8(dst)
@@ -159,7 +171,7 @@ copyDict:
159171
CCMP NE, dictend, match, $0b0100 // 0100 sets the Z (EQ) flag.
160172
BNE copyDict
161173

162-
CBZ len, copyMatchDone
174+
CBZ len, copyMatchDone
163175

164176
// If the match extends beyond the dictionary, the rest is at dstorig.
165177
MOVD dstorig, match

0 commit comments

Comments
 (0)