/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
#endif
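
/*
 * For the misaligned-source path below, LVS generates the permute
 * control vector (lvsl on big endian, lvsr on little endian) and VPERM
 * swaps its two source operands on little endian, so the same call
 * sites splice a pair of aligned quadwords correctly for either byte
 * order.
 */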

        .macro err1
100:
        EX_TABLE(100b,.Ldo_err1)
        .endm

        .macro err2
200:
        EX_TABLE(200b,.Ldo_err2)
        .endm
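
/*
 * err1 marks user accesses made before the register-save frame exists:
 * its fixup (.Ldo_err1) only reloads the original arguments.  err2
 * marks accesses inside the unrolled GPR loop, where .Ldo_err2 must
 * also restore r14-r22 and pop the stack frame.
 */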

#ifdef CONFIG_ALTIVEC
        .macro err3
300:
        EX_TABLE(300b,.Ldo_err3)
        .endm

        .macro err4
400:
        EX_TABLE(400b,.Ldo_err4)
        .endm
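
/*
 * err3/err4 are the VMX-path equivalents: their fixups leave VMX via
 * exit_vmx_usercopy() before falling back, and err4 additionally
 * restores r14-r16, which the vector cacheline loops use as offsets.
 */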


.Ldo_err4:
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Ldo_err3:
        bl      exit_vmx_usercopy
        ld      r0,STACKFRAMESIZE+16(r1)
        mtlr    r0
        b       .Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
        ld      r22,STK_REG(R22)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Lexit:
        addi    r1,r1,STACKFRAMESIZE
.Ldo_err1:
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        ld      r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        ld      r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
        b       __copy_tofrom_user_base
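
/*
 * Whichever fixup runs, it ends by reloading the original destination,
 * source and length and branching to __copy_tofrom_user_base, which
 * redoes the copy with the generic routine and returns the number of
 * bytes that could not be copied.
 */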


_GLOBAL(__copy_tofrom_user_power7)
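        /*
         * Copies shorter than 16 bytes go straight to the scalar
         * .Lshort_copy tail.  With CONFIG_ALTIVEC, copies of 3328 bytes
         * or more take the VMX path; everything in between uses the
         * unrolled GPR loop at .Lnonvmx_copy.  As an illustrative C
         * sketch (helper names invented):
         *
         *      if (len < 16)
         *              return short_copy(to, from, len);
         *      else if (IS_ENABLED(CONFIG_ALTIVEC) && len >= 3328)
         *              return vmx_copy(to, from, len);
         *      else
         *              return nonvmx_copy(to, from, len);
         */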
#ifdef CONFIG_ALTIVEC
        cmpldi  r5,16
        cmpldi  cr1,r5,3328

        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        blt     .Lshort_copy
        bge     cr1,.Lvmx_copy
#else
        cmpldi  r5,16

        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        blt     .Lshort_copy
#endif

.Lnonvmx_copy:
        /* Get the source 8B aligned */
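        /*
         * neg gives the byte count up to the next 8-byte boundary in
         * the low bits of r6; mtocrf 0x01 copies those bits into CR7,
         * so each conditional branch below skips the 1-, 2- or 4-byte
         * move that this particular misalignment does not need.
         */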
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)

        bf      cr7*4+3,1f
err1;   lbz     r0,0(r4)
        addi    r4,r4,1
err1;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
        .align  5
4:
err2;   ld      r0,0(r4)
err2;   ld      r6,8(r4)
err2;   ld      r7,16(r4)
err2;   ld      r8,24(r4)
err2;   ld      r9,32(r4)
err2;   ld      r10,40(r4)
err2;   ld      r11,48(r4)
err2;   ld      r12,56(r4)
err2;   ld      r14,64(r4)
err2;   ld      r15,72(r4)
err2;   ld      r16,80(r4)
err2;   ld      r17,88(r4)
err2;   ld      r18,96(r4)
err2;   ld      r19,104(r4)
err2;   ld      r20,112(r4)
err2;   ld      r21,120(r4)
        addi    r4,r4,128
err2;   std     r0,0(r3)
err2;   std     r6,8(r3)
err2;   std     r7,16(r3)
err2;   std     r8,24(r3)
err2;   std     r9,32(r3)
err2;   std     r10,40(r3)
err2;   std     r11,48(r3)
err2;   std     r12,56(r3)
err2;   std     r14,64(r3)
err2;   std     r15,72(r3)
err2;   std     r16,80(r3)
err2;   std     r17,88(r3)
err2;   std     r18,96(r3)
err2;   std     r19,104(r3)
err2;   std     r20,112(r3)
err2;   std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
5:      srdi    r6,r5,4
        mtocrf  0x01,r6
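        /*
         * CR7 now holds bits 7:4 of the remaining length; with fewer
         * than 128 bytes left only the 64-, 32- and 16-byte bits can be
         * set, and each one selects the matching block below.
         */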

6:      bf      cr7*4+1,7f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
err1;   ld      r9,32(r4)
err1;   ld      r10,40(r4)
err1;   ld      r11,48(r4)
err1;   ld      r12,56(r4)
        addi    r4,r4,64
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
err1;   std     r9,32(r3)
err1;   std     r10,40(r3)
err1;   std     r11,48(r3)
err1;   std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
        addi    r4,r4,32
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
        addi    r4,r4,16
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err1;   lwz     r6,4(r4)
        addi    r4,r4,8
err1;   stw     r0,0(r3)
err1;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err1;   lbz     r0,0(r4)
err1;   stb     r0,0(r3)

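        /*
         * __copy_tofrom_user() returns the number of bytes that could
         * not be copied, so the straight-line exit returns 0.
         */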
15:     li      r3,0
        blr

.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
        mflr    r0
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      enter_vmx_usercopy
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
        ld      r4,STK_REG(R30)(r1)
        ld      r5,STK_REG(R29)(r1)
        mtlr    r0

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
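        /*
         * Below, r6/r9 hold the cacheline-aligned start addresses (the
         * store side with stream ID 1 in the low bits), r7/r10 hold the
         * length in cachelines plus the depth and stream ID, and the
         * final dcbt on r8 with GO=1 starts all configured streams.
         */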
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */
        clrldi  r8,r8,32

        /* setup read stream 0 */
        dcbt    0,r6,0b01000   /* addr from */
        dcbt    0,r7,0b01010   /* length and depth from */
        /* setup write stream 1 */
        dcbtst  0,r9,0b01000   /* addr to */
        dcbtst  0,r10,0b01010  /* length and depth to */
        eieio
        dcbt    0,r8,0b01010    /* all streams GO */

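        /*
         * cr1 still holds the enter_vmx_usercopy() result from above;
         * zero means VMX is not usable here, so pop the frame and fall
         * back to the GPR copy.
         */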
        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
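        /*
         * "Relatively aligned" means (src ^ dst) has no bits set below
         * bit 4, i.e. both pointers reach 16-byte alignment together;
         * rldicl. keeps just those low four bits and sets CR0.
         */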
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        bf      cr7*4+3,5f
err3;   lvx     v1,0,r4
        addi    r4,r4,16
err3;   stvx    v1,0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
err3;   lvx     v1,0,r4
err3;   lvx     v0,r4,r9
        addi    r4,r4,32
err3;   stvx    v1,0,r3
err3;   stvx    v0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     v3,0,r4
err3;   lvx     v2,r4,r9
err3;   lvx     v1,r4,r10
err3;   lvx     v0,r4,r11
        addi    r4,r4,64
err3;   stvx    v3,0,r3
err3;   stvx    v2,r3,r9
err3;   stvx    v1,r3,r10
err3;   stvx    v0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
err4;   lvx     v7,0,r4
err4;   lvx     v6,r4,r9
err4;   lvx     v5,r4,r10
err4;   lvx     v4,r4,r11
err4;   lvx     v3,r4,r12
err4;   lvx     v2,r4,r14
err4;   lvx     v1,r4,r15
err4;   lvx     v0,r4,r16
        addi    r4,r4,128
err4;   stvx    v7,0,r3
err4;   stvx    v6,r3,r9
err4;   stvx    v5,r3,r10
err4;   stvx    v4,r3,r11
err4;   stvx    v3,r3,r12
err4;   stvx    v2,r3,r14
err4;   stvx    v1,r3,r15
err4;   stvx    v0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     v3,0,r4
err3;   lvx     v2,r4,r9
err3;   lvx     v1,r4,r10
err3;   lvx     v0,r4,r11
        addi    r4,r4,64
err3;   stvx    v3,0,r3
err3;   stvx    v2,r3,r9
err3;   stvx    v1,r3,r10
err3;   stvx    v0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     v1,0,r4
err3;   lvx     v0,r4,r9
        addi    r4,r4,32
err3;   stvx    v1,0,r3
err3;   stvx    v0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     v1,0,r4
        addi    r4,r4,16
err3;   stvx    v1,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */
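        /*
         * exit_vmx_usercopy() is expected to return 0 here, which then
         * serves as the "no bytes left uncopied" return value for the
         * tail call above.
         */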

.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r7,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        LVS(v16,0,r4)           /* Setup permute control vector */
err3;   lvx     v0,0,r4
        addi    r4,r4,16
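        /*
         * The LVS control vector encodes the source misalignment, and
         * one aligned quadword is pre-loaded into v0 so that each VPERM
         * below can splice the previous and current aligned loads into
         * a correctly aligned 16-byte result.
         */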

        bf      cr7*4+3,5f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
err3;   stvx    v8,0,r3
        addi    r3,r3,16
        vor     v0,v1,v1

5:      bf      cr7*4+2,6f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
err3;   stvx    v10,r3,r10
err3;   stvx    v11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
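        /*
         * Each VPERM merges the quadword loaded on the previous step
         * with the one just loaded, roughly (illustrative only):
         *
         *      result = splice(prev, cur, src_offset & 0xf);
         *      prev = cur;
         *
         * The last load of an iteration lands in v0, so the carried
         * quadword survives into the next iteration.
         */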
        .align  5
8:
err4;   lvx     v7,0,r4
        VPERM(v8,v0,v7,v16)
err4;   lvx     v6,r4,r9
        VPERM(v9,v7,v6,v16)
err4;   lvx     v5,r4,r10
        VPERM(v10,v6,v5,v16)
err4;   lvx     v4,r4,r11
        VPERM(v11,v5,v4,v16)
err4;   lvx     v3,r4,r12
        VPERM(v12,v4,v3,v16)
err4;   lvx     v2,r4,r14
        VPERM(v13,v3,v2,v16)
err4;   lvx     v1,r4,r15
        VPERM(v14,v2,v1,v16)
err4;   lvx     v0,r4,r16
        VPERM(v15,v1,v0,v16)
        addi    r4,r4,128
err4;   stvx    v8,0,r3
err4;   stvx    v9,r3,r9
err4;   stvx    v10,r3,r10
err4;   stvx    v11,r3,r11
err4;   stvx    v12,r3,r12
err4;   stvx    v13,r3,r14
err4;   stvx    v14,r3,r15
err4;   stvx    v15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
err3;   stvx    v10,r3,r10
err3;   stvx    v11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
err3;   stvx    v8,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
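        /*
         * The permute path keeps the source pointer one quadword ahead
         * because of the initial pre-load into v0, so step it back
         * before the scalar tail copies the final bytes.
         */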
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r6,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */
#endif /* CONFIG_ALTIVEC */