#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H

#include <asm/i387.h>

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyway.
 */

typedef struct {
        unsigned long a, b;
} __attribute__((aligned(16))) xmm_store_t;

/* Doesn't use gcc to save the XMM registers, because there is no easy way
   to tell it to issue a clts before saving the registers. */
#define XMMS_SAVE                               \
do {                                            \
        preempt_disable();                      \
        if (!__thread_has_fpu(current))         \
                clts();                         \
        asm volatile(                           \
                "movups %%xmm0,(%1)     ;\n\t"  \
                "movups %%xmm1,0x10(%1) ;\n\t"  \
                "movups %%xmm2,0x20(%1) ;\n\t"  \
                "movups %%xmm3,0x30(%1) ;\n\t"  \
                : "=&r" (cr0)                   \
                : "r" (xmm_save)                \
                : "memory");                    \
} while (0)

#define XMMS_RESTORE                            \
do {                                            \
        asm volatile(                           \
                "sfence                 ;\n\t"  \
                "movups (%1),%%xmm0     ;\n\t"  \
                "movups 0x10(%1),%%xmm1 ;\n\t"  \
                "movups 0x20(%1),%%xmm2 ;\n\t"  \
                "movups 0x30(%1),%%xmm3 ;\n\t"  \
                :                               \
                : "r" (cr0), "r" (xmm_save)     \
                : "memory");                    \
        if (!__thread_has_fpu(current))         \
                stts();                         \
        preempt_enable();                       \
} while (0)
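
/*
 * Every xor_sse_*() routine below brackets its asm loop with this pair:
 * XMMS_SAVE disables preemption, clears CR0.TS via clts() when the
 * current task does not own the FPU (so the movups cannot fault with
 * #NM), and spills %xmm0-%xmm3 into the caller's 16-byte-aligned
 * xmm_save[] buffer.  XMMS_RESTORE issues an sfence to order the xor
 * loop's stores, reloads the registers, sets TS again via stts() when
 * needed, and re-enables preemption.
 */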

#define OFFS(x)         "16*("#x")"
#define PF_OFFS(x)      "256+16*("#x")"
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
#define LD(x, y)        "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
#define ST(x, y)        "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
#define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
#define XO1(x, y)       "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
#define XO2(x, y)       "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
#define XO3(x, y)       "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
#define XO4(x, y)       "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
#define XO5(x, y)       "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"

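/*
 * Mnemonics for the unrolled loops below: LD(x, y) and ST(x, y) move
 * 16-byte line x of the destination p1 into and out of %xmm<y>;
 * XOn(x, y) xors line x of source p<n+1> into %xmm<y>; PFn(x) issues
 * a prefetchnta on p<n+1> at line x plus 256 bytes, i.e. one full loop
 * iteration ahead.  OFFS()/PF_OFFS() scale the line index by 16.
 */
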
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned int lines = bytes >> 8;
        unsigned long cr0;
        xmm_store_t xmm_save[4];

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                PF1(i)                                  \
                                PF1(i + 2)              \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]      ;\n"
        "       addq %[inc], %[p2]      ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}
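
/*
 * Illustrative only: callers normally reach these routines through the
 * generic xor_blocks() dispatcher rather than calling them directly.
 * A direct call on two hypothetical 4096-byte buffers would be
 *
 *      static unsigned long dst[512] __attribute__((aligned(16)));
 *      static unsigned long src[512] __attribute__((aligned(16)));
 *
 *      xor_sse_2(sizeof(dst), dst, src);
 *
 * leaving dst[i] ^= src[i] for all i.  bytes must be a multiple of 256
 * (each loop iteration consumes 16 lines of 16 bytes) and the buffers
 * must be 16-byte aligned, since the loop uses movaps.
 */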

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]      ;\n"
        "       addq %[inc], %[p2]      ;\n"
        "       addq %[inc], %[p3]      ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]      ;\n"
        "       addq %[inc], %[p2]      ;\n"
        "       addq %[inc], %[p3]      ;\n"
        "       addq %[inc], %[p4]      ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}
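
/*
 * Note the [cnt] "+c" constraint above (xor_sse_5 uses it too): with
 * four or five pointers kept live across the loop, the counter is
 * pinned to %ecx, presumably to ease register pressure, whereas
 * xor_sse_2 and xor_sse_3 get by with a plain "+r".
 */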

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                PF4(i)                                  \
                                PF4(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                XO4(i, 0)                               \
                        XO4(i + 1, 1)                   \
                                XO4(i + 2, 2)           \
                                        XO4(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]      ;\n"
        "       addq %[inc], %[p2]      ;\n"
        "       addq %[inc], %[p3]      ;\n"
        "       addq %[inc], %[p4]      ;\n"
        "       addq %[inc], %[p5]      ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
          [p5] "+r" (p5)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static struct xor_block_template xor_block_sse = {
        .name = "generic_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                       \
do {                                            \
        xor_speed(&xor_block_sse);              \
} while (0)
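
/*
 * The generic xor code expands XOR_TRY_TEMPLATES at boot to benchmark
 * each candidate with xor_speed(); this header offers only the single
 * SSE template.
 */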

/* We force the use of the SSE xor block because it can write around the L2
   cache.  We may also be able to load into the L1 cache only, depending on
   how the CPU deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)

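/*
 * A rough sketch of how the generic calibration code is expected to
 * honor the override above (the surrounding logic is paraphrased from
 * crypto/xor.c and is an assumption here, not part of this header):
 *
 *      fastest = <template that xor_speed() measured as best>;
 *      #ifdef XOR_SELECT_TEMPLATE
 *      fastest = XOR_SELECT_TEMPLATE(fastest);
 *      #endif
 *
 * With the definition above, the benchmark result is thus discarded
 * and &xor_block_sse is always chosen.
 */
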
#endif /* _ASM_X86_XOR_64_H */