2 * This file contains the light-weight system call handlers (fsyscall-handlers).
4 * Copyright (C) 2003 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com>
7 * 18-Feb-03 louisk Implement fsys_gettimeofday().
8 * 28-Feb-03 davidm Fixed several bugs in fsys_gettimeofday(). Tuned it some more,
9 * probably broke it along the way... ;-)
12 #include <asm/asmmacro.h>
13 #include <asm/errno.h>
14 #include <asm/offsets.h>
15 #include <asm/percpu.h>
16 #include <asm/thread_info.h>
18 #include <asm/system.h>
19 #include <asm/unistd.h>
24 * See Documentation/ia64/fsys.txt for details on fsyscalls.
26 * On entry to an fsyscall handler:
27 * r10 = 0 (i.e., defaults to "successful syscall return")
28 * r11 = saved ar.pfs (a user-level value)
29 * r15 = system call number
30 * r16 = "current" task pointer (in normal kernel-mode, this is in r13)
31 * r32-r39 = system call arguments
32 * b6 = return address (a user-level value)
33 * ar.pfs = previous frame-state (a user-level value)
34 * PSR.be = cleared to zero (i.e., little-endian byte order is in effect)
35 * all other registers may contain values passed in from user-mode
37 * On return from an fsyscall handler:
38 * r11 = saved ar.pfs (as passed into the fsyscall handler)
39 * r15 = system call number (as passed into the fsyscall handler)
40 * r32-r39 = system call arguments (as passed into the fsyscall handler)
41 * b6 = return address (as passed into the fsyscall handler)
42 * ar.pfs = previous frame-state (as passed into the fsyscall handler)
// fsys_ni_syscall: handler referenced from the second slot of fsyscall_table.
// Only the entry label and the MCKINLEY_E9_WORKAROUND macro invocation are visible in
// this excerpt; the remainder of the body (return value and return branch) is not shown.
45 ENTRY(fsys_ni_syscall)
51 MCKINLEY_E9_WORKAROUND
// NOTE(review): the ENTRY()/END() lines of this routine are not visible in this excerpt.
// Judging by the load of current->tgid into r8 it looks like the body of fsys_getpid
// (which fsyscall_table references) -- confirm against the full file.
// Per the fsyscall entry conventions, r16 is the "current" task pointer.
59 add r9=TI_FLAGS+IA64_TASK_SIZE,r16 // r9 = &current_thread_info()->flags
62 add r8=IA64_TASK_TGID_OFFSET,r16 // r8 = &current->tgid
64 and r9=TIF_ALLWORK_MASK,r9 // keep only the TIF_ALLWORK_MASK bits (the load of the flags into r9 is not visible here)
65 ld4 r8=[r8] // r8 = current->tgid
// p8 is set by an instruction that is not visible in this excerpt; presumably it means
// that some TIF_ALLWORK_MASK flag is pending, in which case we take the heavy-weight path.
68 (p8) br.spnt.many fsys_fallback_syscall
69 MCKINLEY_E9_WORKAROUND
// NOTE(review): the ENTRY()/END() lines of this routine are not visible in this excerpt.
// Judging by the load of current->group_leader->real_parent->tgid into r8 it looks like
// the body of fsys_getppid (which fsyscall_table references) -- confirm against the full file.
// Per the fsyscall entry conventions, r16 is the "current" task pointer.
77 add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16 // r17 = &current->group_leader
79 ld8 r17=[r17] // r17 = current->group_leader
80 add r9=TI_FLAGS+IA64_TASK_SIZE,r16 // r9 = &current_thread_info()->flags
84 add r17=IA64_TASK_REAL_PARENT_OFFSET,r17 // r17 = &current->group_leader->real_parent
86 and r9=TIF_ALLWORK_MASK,r9 // keep only the TIF_ALLWORK_MASK bits (the load of the flags into r9 is not visible here)
// Read sequence: load real_parent (r18), load its tgid (r8), then load real_parent again
// (r19) and branch back to label 1 if the two pointer values differ.
88 1: ld8 r18=[r17] // r18 = current->group_leader->real_parent
91 add r8=IA64_TASK_TGID_OFFSET,r18 // r8 = &current->group_leader->real_parent->tgid
95 * The .acq is needed to ensure that the read of tgid has returned its data before
96 * we re-check "real_parent".
98 ld4.acq r8=[r8] // r8 = current->group_leader->real_parent->tgid
101 * Re-read current->group_leader->real_parent.
103 ld8 r19=[r17] // r19 = current->group_leader->real_parent
// p8 is set by an instruction that is not visible in this excerpt; presumably it means
// that some TIF_ALLWORK_MASK flag is pending, in which case we take the heavy-weight path.
104 (p8) br.spnt.many fsys_fallback_syscall
106 cmp.ne p6,p0=r18,r19 // did real_parent change?
107 mov r19=0 // i must not leak kernel bits...
108 (p6) br.cond.spnt.few 1b // yes -> redo the read of tgid and the check
110 mov r17=0 // i must not leak kernel bits...
111 mov r18=0 // i must not leak kernel bits...
// NOTE(review): r17/r18 are zeroed a second time below (plus r19).  The two groups
// presumably belong to different conditional-compilation branches whose #if/#else lines
// are not visible in this excerpt -- confirm against the full file.
113 mov r17=0 // i must not leak kernel bits...
114 mov r18=0 // i must not leak kernel bits...
115 mov r19=0 // i must not leak kernel bits...
117 MCKINLEY_E9_WORKAROUND
// fsys_set_tid_address: light-weight handler for set_tid_address (see fsyscall_table).
// r32 is the single syscall argument and r16 the "current" task pointer.  The visible
// code checks r32 for NaT, forms the addresses of current->pid and
// current->clear_child_tid, falls back to the heavy-weight path when p8 is set, and
// clears the scratch registers that held kernel values.  The load/store instructions
// that use r8/r18 and the final return branch are not visible in this excerpt.
121 ENTRY(fsys_set_tid_address)
125 add r9=TI_FLAGS+IA64_TASK_SIZE,r16 // r9 = &current_thread_info()->flags
128 tnat.z p6,p7=r32 // check argument register for being NaT
130 and r9=TIF_ALLWORK_MASK,r9 // keep only the TIF_ALLWORK_MASK bits (the load of the flags into r9 is not visible here)
131 add r8=IA64_TASK_PID_OFFSET,r16 // r8 = &current->pid
132 add r18=IA64_TASK_CLEAR_CHILD_TID_OFFSET,r16 // r18 = &current->clear_child_tid
// p8 is set by an instruction that is not visible in this excerpt; presumably it means
// that some TIF_ALLWORK_MASK flag is pending.
140 (p8) br.spnt.many fsys_fallback_syscall
142 mov r17=0 // i must not leak kernel bits...
143 mov r18=0 // i must not leak kernel bits...
144 MCKINLEY_E9_WORKAROUND
146 END(fsys_set_tid_address)
149 * Note 1: This routine uses floating-point registers, but only with registers that
150 * operate on integers. Because of that, we don't need to set ar.fpsr to the
151 * kernel default value.
153 * Note 2: For now, we will assume that all CPUs run at the same clock-frequency.
154 * If that wasn't the case, we would have to disable preemption (e.g.,
155 * by disabling interrupts) between reading the ITC and reading
156 * local_cpu_data->nsec_per_cyc.
158 * Note 3: On platforms where the ITC-drift bit is set in the SAL feature vector,
159 * we ought to either skip the ITC-based interpolation or run an ntp-like
160 * daemon to keep the ITCs from drifting too far apart.
// fsys_gettimeofday: light-weight gettimeofday().
//   In:   r32, r33 = the two gettimeofday arguments; r16 = "current" task pointer.
//   Out:  on success, sec (r2) is stored through r32 and usec (r3) through r9 using
//         EX()-wrapped stores, then the routine returns through b6.
//   Fail: a NaT argument, or a fault in one of the EX()-wrapped probes/stores, ends up
//         at .fail, which returns with r8 = EINVAL and r10 = -1.
//   Fallback: branches to fsys_fallback_syscall when p8 is set (ITC-drift platform
//         feature bit, or the later check whose compare is not visible in this excerpt).
// NOTE(review): several instructions of the original routine (stop bits, some loads and
// register setups such as r17, r26, f11 and the pointer in r9) are not visible in this
// excerpt; the comments below only describe what is shown.
163 ENTRY(fsys_gettimeofday)
167 add r9=TI_FLAGS+IA64_TASK_SIZE,r16 // r9 = &current_thread_info()->flags
168 movl r3=THIS_CPU(cpu_info)
170 mov.m r31=ar.itc // put time stamp into r31 (ITC) == now (35 cyc)
172 movl r10=__per_cpu_offset
173 movl r2=sal_platform_features
177 movl r19=xtime // xtime is a timespec struct
179 ld8 r10=[r10] // r10 <- __per_cpu_offset[0]
180 movl r21=THIS_CPU(cpu_info)
182 add r10=r21, r10 // r10 <- &cpu_data(time_keeper_id)
183 tbit.nz p8,p0 = r2, IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT_BIT
184 (p8) br.spnt.many fsys_fallback_syscall
// NOTE(review): r19 is loaded with xtime a second time here; the two loads presumably
// sit in different conditional-compilation branches not visible in this excerpt.
188 movl r19=xtime // xtime is a timespec struct
194 // r32, r33 should contain the 2 args of gettimeofday
195 adds r21=IA64_CPUINFO_ITM_NEXT_OFFSET, r10
197 tnat.nz p6,p7=r32 // guard against NaT args
200 adds r10=IA64_CPUINFO_ITM_DELTA_OFFSET, r10
201 (p7) tnat.nz p6,p0=r33
202 (p6) br.cond.spnt.few .fail
204 adds r8=IA64_CPUINFO_NSEC_PER_CYC_OFFSET, r3
// 2361183241434822607 == ceil(2^71 / 1000): together with the "shr.u 3" before and the
// "shr.u 4" after the high-half multiply (xmpy.hu) further down, this divides by 1000.
205 movl r24=2361183241434822607 // for division hack (only for / 1000)
208 ldf8 f7=[r10] // f7 now contains itm_delta
212 adds r20=IA64_TIMESPEC_TV_NSEC_OFFSET, r19 // r20 = &xtime->tv_nsec
215 setf.sig f9=r24 // f9 is used for division hack
216 movl r27=wall_jiffies
218 and r9=TIF_ALLWORK_MASK,r9 // keep only the TIF_ALLWORK_MASK bits (the load of the flags into r9 is not visible here)
219 movl r25=last_nsec_offset
223 * Verify that we have permission to write to struct timeval. Note:
224 * Another thread might unmap the mapping before we actually get
225 * to store the result. That's OK as long as the stores are also
* protected by EX() (the final st8 stores of sec and usec are wrapped in EX(.fail, ...)).
228 EX(.fail, probe.w.fault r32, 3) // this must come _after_ NaT-check
229 EX(.fail, probe.w.fault r10, 3) // this must come _after_ NaT-check
232 ldf8 f10=[r8] // f10 <- local_cpu_data->nsec_per_cyc value
// p8 here comes from a compare that is not visible in this excerpt; presumably it tests
// the masked thread-info flags in r9.
234 (p8) br.spnt.many fsys_fallback_syscall
236 .retry: // *** seq = read_seqbegin(&xtime_lock); ***
237 ld4.acq r23=[r17] // since &xtime_lock == &xtime_lock->sequence
238 ld8 r14=[r25] // r14 (old) = last_nsec_offset
240 ld8 r28=[r26] // r28 = jiffies
241 ld8 r29=[r27] // r29 = wall_jiffies
244 ldf8 f8=[r21] // f8 now contains itm_next
245 sub r28=r29, r28, 1 // r28 now contains "-(lost + 1)"
246 tbit.nz p9, p10=r23, 0 // p9 <- is_odd(r23), p10 <- is_even(r23)
249 ld8 r2=[r19] // r2 = sec = xtime.tv_sec
250 ld8 r29=[r20] // r29 = nsec = xtime.tv_nsec
252 setf.sig f6=r28 // f6 <- -(lost + 1) (6 cyc)
256 xma.l f8=f6, f7, f8 // f8 (last_tick) <- -(lost + 1)*itm_delta + itm_next (5 cyc)
259 setf.sig f12=r31 // f12 <- ITC (6 cyc)
260 // *** if (unlikely(read_seqretry(&xtime_lock, seq))) continue; ***
261 ld4 r24=[r17] // r24 = xtime_lock->sequence (re-read)
265 mov r31=ar.itc // re-read ITC in case we .retry (35 cyc)
266 xma.l f8=f11, f8, f12 // f8 (elapsed_cycles) <- (-1*last_tick + now) = (now - last_tick)
270 getf.sig r18=f8 // r18 <- (now - last_tick)
271 xmpy.l f8=f8, f10 // f8 <- elapsed_cycles*nsec_per_cyc (5 cyc)
272 add r3=r29, r14 // r3 = (nsec + old)
275 cmp.lt p7, p8=r18, r0 // if now < last_tick, set p7 = 1, p8 = 0
276 getf.sig r18=f8 // r18 = elapsed_cycles*nsec_per_cyc (6 cyc)
// Retry when the sequence number was odd at the start (p9 from the tbit above) or, if it
// was even (p10), when it differs from the value re-read into r24.
280 (p10) cmp.ne p9, p0=r23, r24 // if xtime_lock->sequence != seq, set p9
281 shr.u r18=r18, IA64_NSEC_PER_CYC_SHIFT // r18 <- offset
282 (p9) br.spnt.many .retry
285 mov ar.ccv=r14 // ar.ccv = old (1 cyc)
286 cmp.leu p7, p8=r18, r14 // if (offset <= old), set p7 = 1, p8 = 0
// Only when offset > old (p8): try to publish the new offset in last_nsec_offset; r24
// receives the value found in memory, which is compared against old (r14) below.
289 (p8) cmpxchg8.rel r24=[r25], r18, ar.ccv // compare-and-exchange (atomic!)
290 (p8) add r3=r29, r18 // r3 = (nsec + offset)
292 shr.u r3=r3, 3 // initiate dividing r3 by 1000
294 setf.sig f8=r3 // (6 cyc)
295 mov r10=1000000 // r10 = 1000000
297 (p8) cmp.ne.unc p9, p0=r24, r14 // p9 <- the cmpxchg found a value other than old, so start over
298 xmpy.hu f6=f8, f9 // (5 cyc)
299 (p9) br.spnt.many .retry
302 getf.sig r3=f6 // (6 cyc)
304 shr.u r3=r3, 4 // end of division, r3 is divided by 1000 (=usec)
307 1: cmp.geu p7, p0=r3, r10 // while (usec >= 1000000)
309 (p7) sub r3=r3, r10 // usec -= 1000000
310 (p7) adds r2=1, r2 // ++sec
313 // finally: r2 = sec, r3 = usec
314 EX(.fail, st8 [r32]=r2)
// NOTE(review): the instruction that makes r9 point at the second timeval field is not
// visible in this excerpt.
318 EX(.fail, st8 [r9]=r3) // store them in the timeval struct
320 MCKINLEY_E9_WORKAROUND
321 br.ret.sptk.many b6 // return to caller
323 * Note: We are NOT clearing the scratch registers here. Since the only things
324 * in those registers are time-related variables and some addresses (which
325 * can be obtained from System.map), none of this should be security-sensitive
326 * and we should be fine.
329 .fail: adds r8=EINVAL, r0 // r8 = EINVAL
330 adds r10=-1, r0 // r10 = -1
331 MCKINLEY_E9_WORKAROUND
332 br.ret.spnt.many b6 // return with r8 set to EINVAL
333 END(fsys_gettimeofday)
// fsys_fallback_syscall: target of the "(p8) br.spnt.many fsys_fallback_syscall" branches
// in the light-weight handlers.  The visible code takes the address of sys_call_table,
// loads the normal (heavy-weight) syscall entry point into r18 and reads psr into r29.
// NOTE(review): the computation of the table slot address in r18 (presumably from r14
// and the syscall number in r15) and whatever follows the psr read are not visible in
// this excerpt; fsys_bubble_down, which is defined right after this routine, documents
// r18 as the "address of syscall entry point".
335 ENTRY(fsys_fallback_syscall)
340 * We only get here from light-weight syscall handlers. Thus, we already
341 * know that r15 contains a valid syscall number. No need to re-check.
344 movl r14=sys_call_table
348 ld8 r18=[r18] // load normal (heavy-weight) syscall entry-point
349 mov r29=psr // read psr (12 cyc load latency)
353 END(fsys_fallback_syscall)
// fsys_bubble_down: leave the light-weight (fsyscall) path and enter the normal kernel
// syscall path.  The visible code saves user state in scratch registers (ar.unat, pr,
// gp, b6, ar.bspstore, ar.rnat), writes a new psr.l, switches to the kernel register
// backing store, calls ia64_syscall_setup, and then either calls the syscall entry point
// (with rp = ia64_ret_from_syscall) or branches to ia64_trace_syscall, depending on the
// TIF_SYSCALL_TRACE flag.
// NOTE(review): some lines (e.g. the tails of the two #define's and the setup of r8 used
// in the "or r29=r8,r29") are not visible in this excerpt.
355 GLOBAL_ENTRY(fsys_bubble_down)
360 * We get here for syscalls that don't have a lightweight handler. For those, we
361 * need to bubble down into the kernel and that requires setting up a minimal
362 * pt_regs structure, and initializing the CPU state more or less as if an
363 * interruption had occurred. To make syscall-restarts work, we setup pt_regs
364 * such that cr_iip points to the second instruction in syscall_via_break.
365 * Decrementing the IP hence will restart the syscall via break and not
366 * decrementing IP will return us to the caller, as usual. Note that we preserve
367 * the value of psr.pp rather than initializing it from dcr.pp. This makes it
368 * possible to distinguish fsyscall execution from other privileged execution.
371 * - normal fsyscall handler register usage, except that we also have:
372 * - r18: address of syscall entry point
378 # define PSR_PRESERVED_BITS (IA64_PSR_UP | IA64_PSR_MFL | IA64_PSR_MFH | IA64_PSR_PK \
379 | IA64_PSR_DT | IA64_PSR_PP | IA64_PSR_SP | IA64_PSR_RT \
382 * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc. The rest we have
385 # define PSR_ONE_BITS ((3 << IA64_PSR_CPL0_BIT) | (0x1 << IA64_PSR_RI_BIT) \
391 mov r25=ar.unat // save ar.unat (5 cyc)
392 movl r9=PSR_PRESERVED_BITS
394 mov ar.rsc=0 // set enforced lazy mode, pl 0, little-endian, loadrs=0
395 movl r28=__kernel_syscall_via_break
397 mov r23=ar.bspstore // save ar.bspstore (12 cyc)
398 mov r31=pr // save pr (2 cyc)
399 mov r20=r1 // save caller's gp in r20
401 mov r2=r16 // copy current task addr to addl-addressable register
403 mov r19=b6 // save b6 (2 cyc)
405 mov psr.l=r9 // slam the door (17 cyc to srlz.i)
406 or r29=r8,r29 // construct cr.ipsr value to save
407 addl r22=IA64_RBS_OFFSET,r2 // compute base of RBS
409 mov.m r24=ar.rnat // read ar.rnat (5 cyc lat)
410 lfetch.fault.excl.nt1 [r22]
411 adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r2
413 // ensure previous insn group is issued before we stall for srlz.i:
415 srlz.i // ensure new psr.l has been established
416 /////////////////////////////////////////////////////////////////////////////
417 ////////// from this point on, execution is not interruptible anymore
418 /////////////////////////////////////////////////////////////////////////////
419 addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // compute base of memory stack
420 cmp.ne pKStk,pUStk=r0,r0 // set pKStk <- 0, pUStk <- 1
422 st1 [r16]=r0 // clear current->thread.on_ustack flag
423 mov ar.bspstore=r22 // switch to kernel RBS
424 mov b6=r18 // copy syscall entry-point to b6 (7 cyc)
425 add r3=TI_FLAGS+IA64_TASK_SIZE,r2
427 ld4 r3=[r3] // r3 = current_thread_info()->flags
428 mov r18=ar.bsp // save (kernel) ar.bsp (12 cyc)
429 mov ar.rsc=0x3 // set eager mode, pl 0, little-endian, loadrs=0
430 br.call.sptk.many b7=ia64_syscall_setup
433 movl r2=ia64_ret_from_syscall
435 mov rp=r2 // set the real return addr
436 tbit.z p8,p0=r3,TIF_SYSCALL_TRACE // p8 <- TIF_SYSCALL_TRACE is clear in the flags loaded into r3
438 (p8) br.call.sptk.many b6=b6 // ignore this return addr
439 br.cond.sptk ia64_trace_syscall
440 END(fsys_bubble_down)
// fsyscall_table: table of 8-byte (data8) entries, one per system call, holding either
// the address of a light-weight handler defined in this file or 0.  The trailing comments
// give the syscall name and, on every fifth entry, its number.  The final .org directive
// pins the end of the table at fsyscall_table + 8*NR_syscalls.
// NOTE(review): presumably a 0 entry means "no light-weight handler for this syscall";
// the code that indexes this table is not part of this file excerpt -- confirm there.
// NOTE(review): many table rows (and the fsyscall_table: label line itself) are not
// visible in this excerpt, so the visible rows are not contiguous syscall numbers.
444 .globl fsyscall_table
446 data8 fsys_bubble_down
448 data8 fsys_ni_syscall
449 data8 0 // exit // 1025
454 data8 0 // creat // 1030
459 data8 0 // fchdir // 1035
464 data8 0 // lseek // 1040
465 data8 fsys_getpid // getpid
466 data8 fsys_getppid // getppid
469 data8 0 // setuid // 1045
474 data8 0 // sync // 1050
479 data8 0 // mkdir // 1055
484 data8 0 // brk // 1060
489 data8 0 // ioctl // 1065
494 data8 0 // dup2 // 1070
499 data8 0 // getresgid // 1075
504 data8 0 // setpgid // 1080
507 data8 0 // sethostname
509 data8 0 // getrlimit // 1085
511 data8 fsys_gettimeofday // gettimeofday
512 data8 0 // settimeofday
514 data8 0 // poll // 1090
519 data8 0 // swapoff // 1095
524 data8 0 // fchown // 1100
525 data8 0 // getpriority
526 data8 0 // setpriority
529 data8 0 // gettid // 1105
534 data8 0 // msgsnd // 1110
539 data8 0 // shmdt // 1115
549 data8 0 // remap_file_pages // 1125
553 data8 0 // setdomainname
554 data8 0 // newuname // 1130
557 data8 0 // init_module
558 data8 0 // delete_module
564 data8 0 // personality // 1140
565 data8 0 // afs_syscall
569 data8 0 // flock // 1145
574 data8 0 // sysctl // 1150
579 data8 0 // mprotect // 1155
583 data8 0 // munlockall
584 data8 0 // sched_getparam // 1160
585 data8 0 // sched_setparam
586 data8 0 // sched_getscheduler
587 data8 0 // sched_setscheduler
588 data8 0 // sched_yield
589 data8 0 // sched_get_priority_max // 1165
590 data8 0 // sched_get_priority_min
591 data8 0 // sched_rr_get_interval
593 data8 0 // nfsservctl
594 data8 0 // prctl // 1170
595 data8 0 // getpagesize
597 data8 0 // pciconfig_read
598 data8 0 // pciconfig_write
599 data8 0 // perfmonctl // 1175
600 data8 0 // sigaltstack
601 data8 0 // rt_sigaction
602 data8 0 // rt_sigpending
603 data8 0 // rt_sigprocmask
604 data8 0 // rt_sigqueueinfo // 1180
605 data8 0 // rt_sigreturn
606 data8 0 // rt_sigsuspend
607 data8 0 // rt_sigtimedwait
609 data8 0 // capget // 1185
614 data8 0 // socket // 1190
619 data8 0 // getsockname // 1195
620 data8 0 // getpeername
621 data8 0 // socketpair
624 data8 0 // recv // 1200
627 data8 0 // setsockopt
628 data8 0 // getsockopt
629 data8 0 // sendmsg // 1205
631 data8 0 // pivot_root
634 data8 0 // newstat // 1210
638 data8 0 // getdents64
639 data8 0 // getunwind // 1215
644 data8 0 // getxattr // 1220
648 data8 0 // llistxattr
649 data8 0 // flistxattr // 1225
650 data8 0 // removexattr
651 data8 0 // lremovexattr
652 data8 0 // fremovexattr
654 data8 0 // futex // 1230
655 data8 0 // sched_setaffinity
656 data8 0 // sched_getaffinity
657 data8 fsys_set_tid_address // set_tid_address
659 data8 0 // unused // 1235
660 data8 0 // exit_group
661 data8 0 // lookup_dcookie
663 data8 0 // io_destroy
664 data8 0 // io_getevents // 1240
667 data8 0 // epoll_create
669 data8 0 // epoll_wait // 1245
670 data8 0 // restart_syscall
671 data8 0 // semtimedop
672 data8 0 // timer_create
673 data8 0 // timer_settime
674 data8 0 // timer_gettime // 1250
675 data8 0 // timer_getoverrun
676 data8 0 // timer_delete
677 data8 0 // clock_settime
678 data8 0 // clock_gettime
679 data8 0 // clock_getres // 1255
680 data8 0 // clock_nanosleep
705 .org fsyscall_table + 8*NR_syscalls // guard against failures to increase NR_syscalls