asedeno.scripts.mit.edu Git - linux.git/blob - arch/powerpc/lib/memcpy_64.S
Merge tag 'powerpc-4.17-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
[linux.git] / arch / powerpc / lib / memcpy_64.S
1 /*
2  * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version
7  * 2 of the License, or (at your option) any later version.
8  */
9 #include <asm/processor.h>
10 #include <asm/ppc_asm.h>
11 #include <asm/export.h>
12
/*
 * memcpy: copy r5 bytes from the buffer at r4 to the buffer at r3.
 *
 * Register use visible in this routine:
 *   r3 = destination pointer (saved on entry and reloaded as the return
 *        value on the big-endian path; left unmodified on the
 *        little-endian path)
 *   r4 = source pointer
 *   r5 = byte count
 *
 * The first instruction sits in a feature section keyed on
 * CPU_FTR_VMX_COPY, with "b memcpy_power7" as the alternative arm (only
 * present under CONFIG_PPC_BOOK3S_64 and !SELFTEST).
 * NOTE(review): going by the _IFCLR suffix, the first arm is presumably
 * the one kept when CPU_FTR_VMX_COPY is clear and the branch is used
 * otherwise -- confirm against the feature-fixup macro definitions.
 */
13         .align  7
14 _GLOBAL_TOC(memcpy)
15 BEGIN_FTR_SECTION
16 #ifdef __LITTLE_ENDIAN__
17         cmpdi   cr7,r5,0        /* cr7.eq <- (count == 0), tested by beqlr below */
18 #else
19         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* save destination pointer for return value */
20 #endif
21 FTR_SECTION_ELSE
22 #ifdef CONFIG_PPC_BOOK3S_64
23 #ifndef SELFTEST
24         b       memcpy_power7
25 #endif
26 #endif
27 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
28 #ifdef __LITTLE_ENDIAN__
29         /* dumb little-endian memcpy that will get replaced at runtime */
/* Plain byte-at-a-time loop; r3 is never written, so it is still dest at blr. */
30         addi r9,r3,-1           /* r9 = dest - 1: store cursor, keeps r3 intact */
31         addi r4,r4,-1           /* src - 1, to suit the pre-incrementing lbzu */
32         beqlr cr7               /* count == 0: return immediately */
33         mtctr r5                /* CTR = number of bytes to copy */
34 1:      lbzu r10,1(r4)          /* advance r4 by one, load the byte there */
35         stbu r10,1(r9)          /* advance r9 by one, store the byte there */
36         bdnz 1b                 /* decrement CTR, loop while non-zero */
37         blr
38 #else
/*
 * Big-endian setup.  The condition fields written here stay live across the
 * sections that follow:
 *   cr7 = low four bits of the count (cr7*4+0..3 <-> the 8/4/2/1 bits)
 *   cr1 = count compared, unsigned, with 16
 *   cr0 = result of the destination alignment test
 */
39         PPC_MTOCRF(0x01,r5)
40         cmpldi  cr1,r5,16
41         neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
42         andi.   r6,r6,7         /* r6 = 0..7; cr0.eq when dest is already 8-byte aligned */
43         dcbt    0,r4            /* cache-touch hint for the start of the source */
44         blt     cr1,.Lshort_copy        /* fewer than 16 bytes: no doubleword loop */
45 /* Below we want to nop out the bne if we're on a CPU that has the
46    CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
47    cleared.
48    At the time of writing the only CPU that has this combination of bits
49    set is Power6. */
50 BEGIN_FTR_SECTION
51         nop
52 FTR_SECTION_ELSE
53         bne     .Ldst_unaligned         /* cr0 from the andi. above: dest not 8-byte aligned */
54 ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
55                     CPU_FTR_UNALIGNED_LD_STD)
/*
 * Main doubleword copy.  Reached by fall-through from the entry code and by
 * the branch at the end of .Ldst_unaligned.
 *
 * Expected on entry (established by the code that leads here):
 *   r3 = dest, r4 = src, r5 = bytes still to copy
 *   cr7 = low four bits of r5, cr1 = r5 compared (unsigned) with 16
 *
 * The loop moves 16 bytes per iteration and is software pipelined: each load
 * is stored one step later, so one doubleword is always in flight in r8/r9.
 * r3 and r4 are biased (dest - 16, src - 8) so the update-form ldu/stdu can
 * be used; the tail code compensates by reading from 8(r4).
 */
56 .Ldst_aligned:
57         addi    r3,r3,-16       /* bias dest for the 8(r3)/16(r3) stores below */
/*
 * Source alignment test, inside a feature section keyed on
 * CPU_FTR_UNALIGNED_LD_STD.
 * NOTE(review): per the _IFCLR suffix these two instructions are presumably
 * only live when that feature bit is clear -- confirm against the macro.
 */
58 BEGIN_FTR_SECTION
59         andi.   r0,r4,7         /* r0 = src & 7; consumed by .Lsrc_unaligned */
60         bne     .Lsrc_unaligned
61 END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
62         srdi    r7,r5,4         /* r7 = count / 16 = loop iterations */
63         ld      r9,0(r4)        /* preload the first source doubleword */
64         addi    r4,r4,-8        /* bias src for the 8(r4)/16(r4) loads below */
65         mtctr   r7
66         andi.   r5,r5,7         /* r5 = tail bytes (count & 7); cr0.eq if none */
67         bf      cr7*4+0,2f      /* 8-bit of count clear: even doubleword count, enter loop at 2: */
/* Odd doubleword count: undo the bias by 8 and carry the preloaded doubleword in r8. */
68         addi    r3,r3,8
69         addi    r4,r4,8
70         mr      r8,r9
71         blt     cr1,3f          /* fewer than 16 bytes left: just store the one doubleword */
72 1:      ld      r9,8(r4)
73         std     r8,8(r3)
74 2:      ldu     r8,16(r4)       /* load and advance r4 by 16 */
75         stdu    r9,16(r3)       /* store and advance r3 by 16 */
76         bdnz    1b
77 3:      std     r8,8(r3)        /* drain the doubleword still held in r8 */
78         beq     3f              /* cr0 from "andi. r5,r5,7": no tail bytes, return */
79         addi    r3,r3,16        /* r3 -> first destination byte not yet written */
/*
 * Copy the remaining 0-7 bytes as 4, then 2, then 1, selected by the low
 * three bits of the count held in cr7.  r4 is still 8 below the next unread
 * source byte, hence the 8(r4) displacements.
 */
80 .Ldo_tail:
81         bf      cr7*4+1,1f      /* 4-bit clear: skip the word copy */
82         lwz     r9,8(r4)
83         addi    r4,r4,4
84         stw     r9,0(r3)
85         addi    r3,r3,4
86 1:      bf      cr7*4+2,2f      /* 2-bit clear: skip the halfword copy */
87         lhz     r9,8(r4)
88         addi    r4,r4,2
89         sth     r9,0(r3)
90         addi    r3,r3,2
91 2:      bf      cr7*4+3,3f      /* 1-bit clear: skip the byte copy */
92         lbz     r9,8(r4)
93         stb     r9,0(r3)
94 3:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
95         blr
96
/*
 * Doubleword copy for a source that is not 8-byte aligned.  Entered only
 * from the "andi. r0,r4,7 / bne" pair in .Ldst_aligned, so r0 = src & 7 and
 * is non-zero, and r3 already carries the -16 bias applied there.
 *
 * The source is rounded down to an 8-byte boundary and read as whole
 * doublewords; each destination doubleword is built from two neighbours:
 *     d[i] = (s[i] << r10) | (s[i+1] >> r11)
 * where r10 = 8 * (src & 7) and r11 = 64 - r10.
 *
 * Two prologues feed a shared, software-pipelined loop (entered at 1: or 2:)
 * depending on whether the doubleword count is even or odd.
 */
97 .Lsrc_unaligned:
98         srdi    r6,r5,3         /* r6 = count / 8 (whole doublewords) */
99         addi    r5,r5,-16
100         subf    r4,r0,r4        /* round src down to an 8-byte boundary */
101         srdi    r7,r5,4         /* r7 = (count - 16) / 16 = loop iterations */
102         sldi    r10,r0,3        /* r10 = source misalignment in bits */
103         cmpdi   cr6,r6,3        /* cr6 <- doubleword count vs 3 */
104         andi.   r5,r5,7         /* r5 = tail bytes; cr0.eq if none (tested at "beq 4f") */
105         mtctr   r7
106         subfic  r11,r10,64      /* r11 = 64 - r10 */
107         add     r5,r5,r0        /* tail bytes + misalignment, compared with 8 below */
108
109         bt      cr7*4+0,0f      /* 8-bit of count set: odd doubleword count */
110
111         ld      r9,0(r4)        # 3+2n loads, 2+2n stores
112         ld      r0,8(r4)
113         sld     r6,r9,r10
114         ldu     r9,16(r4)
115         srd     r7,r0,r11
116         sld     r8,r0,r10
117         or      r7,r7,r6
118         blt     cr6,4f          /* fewer than 3 doublewords: skip the loop */
119         ld      r0,8(r4)
120         # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
121         b       2f
122
123 0:      ld      r0,0(r4)        # 4+2n loads, 3+2n stores
124         ldu     r9,8(r4)
125         sld     r8,r0,r10
126         addi    r3,r3,-8
127         blt     cr6,5f          /* fewer than 3 doublewords: only the final store remains */
128         ld      r0,8(r4)
129         srd     r12,r9,r11
130         sld     r6,r9,r10
131         ldu     r9,16(r4)
132         or      r12,r8,r12
133         srd     r7,r0,r11
134         sld     r8,r0,r10
135         addi    r3,r3,16
136         beq     cr6,3f          /* exactly 3 doublewords: skip the loop */
137
138         # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
139 1:      or      r7,r7,r6
140         ld      r0,8(r4)
141         std     r12,8(r3)
142 2:      srd     r12,r9,r11
143         sld     r6,r9,r10
144         ldu     r9,16(r4)
145         or      r12,r8,r12
146         stdu    r7,16(r3)
147         srd     r7,r0,r11
148         sld     r8,r0,r10
149         bdnz    1b
150
/* Drain the pipeline: store the doublewords already merged or still pending. */
151 3:      std     r12,8(r3)
152         or      r7,r7,r6
153 4:      std     r7,16(r3)
154 5:      srd     r12,r9,r11
155         or      r12,r8,r12
156         std     r12,24(r3)
157         beq     4f              /* cr0 from "andi. r5,r5,7": no tail; 4f is the return in .Lshort_copy */
/*
 * Tail of 1-7 bytes.  The unused low bytes of the last source doubleword
 * (r9) are shifted to the top.  If tail + misalignment exceeds 8 they do not
 * cover the tail, so one more doubleword is loaded and merged in.
 */
158         cmpwi   cr1,r5,8
159         addi    r3,r3,32        /* r3 -> first destination byte not yet written */
160         sld     r9,r9,r10
161         ble     cr1,6f
162         ld      r0,8(r4)
163         srd     r7,r0,r11
164         or      r9,r7,r9
165 6:
/* Peel bytes off the top of r9: each rotate brings the next chunk into the low bits. */
166         bf      cr7*4+1,1f      /* 4-bit clear: skip the word store */
167         rotldi  r9,r9,32
168         stw     r9,0(r3)
169         addi    r3,r3,4
170 1:      bf      cr7*4+2,2f      /* 2-bit clear: skip the halfword store */
171         rotldi  r9,r9,16
172         sth     r9,0(r3)
173         addi    r3,r3,2
174 2:      bf      cr7*4+3,3f      /* 1-bit clear: skip the byte store */
175         rotldi  r9,r9,8
176         stb     r9,0(r3)
177 3:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
178         blr
179
/*
 * Destination is not 8-byte aligned: copy r6 (1-7) leading bytes so that it
 * becomes aligned, then continue at .Ldst_aligned with the reduced count.
 *
 * The pieces are copied smallest first (1, then 2, then 4 bytes, chosen by
 * the bits of r6 in cr7), with r7 as the running offset from r3/r4; r3 and
 * r4 themselves are only advanced once, by r6, at the end.  cr1 and cr7 are
 * recomputed for the remaining count because .Ldst_aligned relies on them.
 */
180 .Ldst_unaligned:
181         PPC_MTOCRF(0x01,r6)             # put #bytes to 8B bdry into cr7
182         subf    r5,r6,r5        /* count -= leading bytes */
183         li      r7,0            /* r7 = offset of the next piece to copy */
184         cmpldi  cr1,r5,16       /* refresh cr1 for the remaining count */
185         bf      cr7*4+3,1f      /* 1-bit clear: skip the byte copy */
186         lbz     r0,0(r4)
187         stb     r0,0(r3)
188         addi    r7,r7,1
189 1:      bf      cr7*4+2,2f      /* 2-bit clear: skip the halfword copy */
190         lhzx    r0,r7,r4
191         sthx    r0,r7,r3
192         addi    r7,r7,2
193 2:      bf      cr7*4+1,3f      /* 4-bit clear: skip the word copy */
194         lwzx    r0,r7,r4
195         stwx    r0,r7,r3
196 3:      PPC_MTOCRF(0x01,r5)     /* cr7 <- low 4 bits of the remaining count */
197         add     r4,r6,r4        /* step src past the leading bytes */
198         add     r3,r6,r3        /* step dest past them: now 8-byte aligned */
199         b       .Ldst_aligned
200
/*
 * Copies of fewer than 16 bytes (taken from entry on "blt cr1").  cr7 holds
 * the low four bits of the count, so the copy is done as an optional 8 bytes
 * (two word accesses), then 4, 2 and 1, advancing r3/r4 after each piece.
 * The "4:" return below is also the target of "beq 4f" in .Lsrc_unaligned.
 */
201 .Lshort_copy:
202         bf      cr7*4+0,1f      /* 8-bit clear: skip the 8-byte copy */
203         lwz     r0,0(r4)
204         lwz     r9,4(r4)
205         addi    r4,r4,8
206         stw     r0,0(r3)
207         stw     r9,4(r3)
208         addi    r3,r3,8
209 1:      bf      cr7*4+1,2f      /* 4-bit clear: skip the word copy */
210         lwz     r0,0(r4)
211         addi    r4,r4,4
212         stw     r0,0(r3)
213         addi    r3,r3,4
214 2:      bf      cr7*4+2,3f      /* 2-bit clear: skip the halfword copy */
215         lhz     r0,0(r4)
216         addi    r4,r4,2
217         sth     r0,0(r3)
218         addi    r3,r3,2
219 3:      bf      cr7*4+3,4f      /* 1-bit clear: skip the byte copy */
220         lbz     r0,0(r4)
221         stb     r0,0(r3)
222 4:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
223         blr
224 #endif
225 EXPORT_SYMBOL(memcpy)