/* FILE: arch/i386/kernel/entry.S */ 1 /* 2 * linux/arch/i386/entry.S 3 * 4 * Copyright (C) 1991, 1992 Linus Torvalds 5 */ 6 7 /* 8 * entry.S contains the system-call and fault low-level 9 * handling routines. This also contains the 10 * timer-interrupt handler, as well as all interrupts and 11 * faults that can result in a task-switch. 12 * 13 * NOTE: This code handles signal-recognition, which 14 * happens every time after a timer-interrupt and after 15 * each system call. 16 * 17 * I changed all the .align's to 4 (16 byte alignment), 18 * as that's faster on a 486. 19 * 20 * Stack layout in 'ret_from_system_call': 21 * ptrace needs to have all regs on the stack. 22 * if the order here is changed, it needs to be 23 * updated in fork.c:copy_process, 24 * signal.c:do_signal, ptrace.c and ptrace.h 25 * 26 * 0(%esp) - %ebx 27 * 4(%esp) - %ecx 28 * 8(%esp) - %edx 29 * C(%esp) - %esi 30 * 10(%esp) - %edi 31 * 14(%esp) - %ebp 32 * 18(%esp) - %eax 33 * 1C(%esp) - %ds 34 * 20(%esp) - %es 35 * 24(%esp) - orig_eax 36 * 28(%esp) - %eip 37 * 2C(%esp) - %cs 38 * 30(%esp) - %eflags 39 * 34(%esp) - %oldesp 40 * 38(%esp) - %oldss 41 * 42 * "current" is in register %ebx during any slow entries. 43 */ 44 45 #include 46 #include 47 #include 48 #define ASSEMBLY 49 #include 50 51 EBX = 0x00 52 ECX = 0x04 53 EDX = 0x08 54 ESI = 0x0C 55 EDI = 0x10 56 EBP = 0x14 57 EAX = 0x18 58 DS = 0x1C 59 ES = 0x20 60 ORIG_EAX = 0x24 61 EIP = 0x28 62 CS = 0x2C 63 EFLAGS = 0x30 64 OLDESP = 0x34 65 OLDSS = 0x38 66 67 CF_MASK = 0x00000001 68 IF_MASK = 0x00000200 69 NT_MASK = 0x00004000 70 VM_MASK = 0x00020000 71 72 /* 73 * these are offsets into the task-struct. 
*/
state		=  0
flags		=  4
sigpending	=  8
addr_limit	= 12
exec_domain	= 16
need_resched	= 20

ENOSYS = 38


/*
 * SAVE_ALL: clear the direction flag, push %es, %ds, %eax, %ebp,
 * %edi, %esi, %edx, %ecx and %ebx (in that order), then load
 * __KERNEL_DS into %ds and %es.  %edx is used as scratch.
 */
#define SAVE_ALL \
	cld; \
	pushl %es; \
	pushl %ds; \
	pushl %eax; \
	pushl %ebp; \
	pushl %edi; \
	pushl %esi; \
	pushl %edx; \
	pushl %ecx; \
	pushl %ebx; \
	movl $(__KERNEL_DS),%edx; \
	movl %dx,%ds; \
	movl %dx,%es;

/*
 * RESTORE_ALL: pop the registers pushed by SAVE_ALL in reverse
 * order, skip one more 4-byte stack slot and iret.
 *
 * The three instructions labelled 1, 2 and 3 are paired with the
 * fixup entries 4, 5 and 6 through __ex_table: fixups 4 and 5
 * store 0 into the stack slot about to be popped and retry the
 * pop; fixup 6 reloads %ds/%es from %ss and calls do_exit with
 * 11 pushed as its argument.
 */
#define RESTORE_ALL \
	popl %ebx; \
	popl %ecx; \
	popl %edx; \
	popl %esi; \
	popl %edi; \
	popl %ebp; \
	popl %eax; \
1:	popl %ds; \
2:	popl %es; \
	addl $4,%esp; \
3:	iret; \
.section .fixup,"ax"; \
4:	movl $0,(%esp); \
	jmp 1b; \
5:	movl $0,(%esp); \
	jmp 2b; \
6:	pushl %ss; \
	popl %ds; \
	pushl %ss; \
	popl %es; \
	pushl $11; \
	call do_exit; \
.previous; \
.section __ex_table,"a"; \
	.align 4; \
	.long 1b,4b; \
	.long 2b,5b; \
	.long 3b,6b; \
.previous

/*
 * GET_CURRENT(reg): reg = %esp & -8192, i.e. %esp rounded down
 * to an 8192-byte boundary.
 */
#define GET_CURRENT(reg) \
	movl %esp, reg; \
	andl $-8192, reg;

ENTRY(lcall7)
	pushfl			# We get a different stack layout with call
	pushl %eax		# gates, which has to be cleaned up later..
	SAVE_ALL
	movl EIP(%esp),%eax	# this is eflags, not eip..
	movl CS(%esp),%edx	# this is eip..
	movl EFLAGS(%esp),%ecx	# and this is cs..
	movl %eax,EFLAGS(%esp)	#
	movl %edx,EIP(%esp)	# move to their "normal" places
	movl %ecx,CS(%esp)	#
	movl %esp,%ebx
	pushl %ebx
	andl $-8192,%ebx	# GET_CURRENT
	movl exec_domain(%ebx),%edx	# Get the execution domain
	movl 4(%edx),%edx	# Get lcall7 handler for domain
	call *%edx
	popl %eax
	jmp ret_from_sys_call


	ALIGN
	.globl ret_from_fork
ret_from_fork:
#ifdef __SMP__
	call SYMBOL_NAME(schedule_tail)
#endif /* __SMP__ */
	GET_CURRENT(%ebx)
	jmp ret_from_sys_call

/*
 * Return to user mode is not as complex as all this
 * looks, but we want the default path for a system call
 * return to go as quickly as possible which is why some
 * of this is less clear than it otherwise should be.
 */

ENTRY(system_call)
	pushl %eax			# save orig_eax
	SAVE_ALL
	GET_CURRENT(%ebx)
	cmpl $(NR_syscalls),%eax
	jae badsys
	testb $0x20,flags(%ebx)		# PF_TRACESYS
	jne tracesys
	call *SYMBOL_NAME(sys_call_table)(,%eax,4)
	movl %eax,EAX(%esp)		# save the return value
	ALIGN
	.globl ret_from_sys_call
	.globl ret_from_intr
ret_from_sys_call:
	movl SYMBOL_NAME(bh_mask),%eax
	andl SYMBOL_NAME(bh_active),%eax
	jne handle_bottom_half
ret_with_reschedule:
	cmpl $0,need_resched(%ebx)
	jne reschedule
	cmpl $0,sigpending(%ebx)
	jne signal_return
restore_all:
	RESTORE_ALL

	ALIGN
signal_return:
	sti				# we can get here from an interrupt handler
	testl $(VM_MASK),EFLAGS(%esp)
	movl %esp,%eax
	jne v86_signal_return
	xorl %edx,%edx
	call SYMBOL_NAME(do_signal)
	jmp restore_all

	ALIGN
v86_signal_return:
	call SYMBOL_NAME(save_v86_state)
	movl %eax,%esp
	xorl %edx,%edx
	call SYMBOL_NAME(do_signal)
	jmp restore_all

	ALIGN
tracesys:
	movl $-ENOSYS,EAX(%esp)
	call SYMBOL_NAME(syscall_trace)
	movl ORIG_EAX(%esp),%eax
	call *SYMBOL_NAME(sys_call_table)(,%eax,4)
	movl %eax,EAX(%esp)		# save the return value
	call SYMBOL_NAME(syscall_trace)
	jmp ret_from_sys_call
badsys:
	movl $-ENOSYS,EAX(%esp)
	jmp ret_from_sys_call

	ALIGN
ret_from_exception:
	movl SYMBOL_NAME(bh_mask),%eax
	andl SYMBOL_NAME(bh_active),%eax
	jne handle_bottom_half
	ALIGN
ret_from_intr:
	GET_CURRENT(%ebx)
	movl EFLAGS(%esp),%eax		# mix EFLAGS and CS
	movb CS(%esp),%al
	testl $(VM_MASK | 3),%eax	# rtn to VM86 mode|non-super?
	jne ret_with_reschedule
	jmp restore_all

	ALIGN
handle_bottom_half:
	call SYMBOL_NAME(do_bottom_half)
	jmp ret_from_intr

	ALIGN
reschedule:
	call SYMBOL_NAME(schedule)	# test
	jmp ret_from_sys_call

ENTRY(divide_error)
	pushl $0			# no error code
	pushl $ SYMBOL_NAME(do_divide_error)
	ALIGN
error_code:
	pushl %ds
	pushl %eax
	xorl %eax,%eax
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %edx
	decl %eax			# eax = -1
	pushl %ecx
	pushl %ebx
	cld
	movl %es,%cx
	xchgl %eax, ORIG_EAX(%esp)	# orig_eax (get error code.)
	movl %esp,%edx
	xchgl %ecx, ES(%esp)		# get the addr and save es.
	pushl %eax			# push the error code
	pushl %edx
	movl $(__KERNEL_DS),%edx
	movl %dx,%ds
	movl %dx,%es
	GET_CURRENT(%ebx)
	call *%ecx
	addl $8,%esp
	jmp ret_from_exception

ENTRY(coprocessor_error)
	pushl $0
	pushl $ SYMBOL_NAME(do_coprocessor_error)
	jmp error_code

ENTRY(device_not_available)
	pushl $-1			# mark this as an int
	SAVE_ALL
	GET_CURRENT(%ebx)
	pushl $ret_from_exception
	movl %cr0,%eax
	testl $0x4,%eax			# EM (math emulation bit)
	je SYMBOL_NAME(math_state_restore)
	pushl $0			# temp storage for ORIG_EIP
	call SYMBOL_NAME(math_emulate)
	addl $4,%esp
	ret

ENTRY(debug)
	pushl $0
	pushl $ SYMBOL_NAME(do_debug)
	jmp error_code

ENTRY(nmi)
	pushl $0
	pushl $ SYMBOL_NAME(do_nmi)
	jmp error_code

ENTRY(int3)
	pushl $0
	pushl $ SYMBOL_NAME(do_int3)
	jmp error_code

ENTRY(overflow)
	pushl $0
	pushl $ SYMBOL_NAME(do_overflow)
	jmp error_code

ENTRY(bounds)
	pushl $0
	pushl $ SYMBOL_NAME(do_bounds)
	jmp error_code

ENTRY(invalid_op)
	pushl $0
	pushl $ SYMBOL_NAME(do_invalid_op)
	jmp error_code

ENTRY(coprocessor_segment_overrun)
	pushl $0
	pushl $ SYMBOL_NAME(do_coprocessor_segment_overrun)
	jmp error_code

ENTRY(reserved)
	pushl $0
	pushl $ SYMBOL_NAME(do_reserved)
	jmp error_code

/*
 * The entries below push only the handler address before joining
 * error_code; unlike the stubs above they do not push a dummy 0
 * error code first.
 */
ENTRY(double_fault)
	pushl $ SYMBOL_NAME(do_double_fault)
	jmp error_code

ENTRY(invalid_TSS)
	pushl $ SYMBOL_NAME(do_invalid_TSS)
	jmp error_code

ENTRY(segment_not_present)
	pushl $ SYMBOL_NAME(do_segment_not_present)
	jmp error_code

ENTRY(stack_segment)
	pushl $ SYMBOL_NAME(do_stack_segment)
	jmp error_code

ENTRY(general_protection)
	pushl $ SYMBOL_NAME(do_general_protection)
	jmp error_code

ENTRY(alignment_check)
	pushl $ SYMBOL_NAME(do_alignment_check)
	jmp error_code

ENTRY(page_fault)
	pushl $ SYMBOL_NAME(do_page_fault)
	jmp error_code

ENTRY(spurious_interrupt_bug)
	pushl $0
	pushl $ SYMBOL_NAME(do_spurious_interrupt_bug)
	jmp error_code

.data
ENTRY(sys_call_table)
	.long SYMBOL_NAME(sys_ni_syscall)	/* 0 */
	.long SYMBOL_NAME(sys_exit)
	.long SYMBOL_NAME(sys_fork)
	.long SYMBOL_NAME(sys_read)
	.long SYMBOL_NAME(sys_write)
	.long SYMBOL_NAME(sys_open)		/* 5 */
	.long SYMBOL_NAME(sys_close)
	.long SYMBOL_NAME(sys_waitpid)
	.long SYMBOL_NAME(sys_creat)
	.long SYMBOL_NAME(sys_link)
	.long SYMBOL_NAME(sys_unlink)		/* 10 */
	.long SYMBOL_NAME(sys_execve)
	.long SYMBOL_NAME(sys_chdir)
	.long SYMBOL_NAME(sys_time)
	.long SYMBOL_NAME(sys_mknod)
	.long SYMBOL_NAME(sys_chmod)		/* 15 */
	.long SYMBOL_NAME(sys_lchown)
	.long SYMBOL_NAME(sys_ni_syscall)	/*old break holder*/
	.long SYMBOL_NAME(sys_stat)
	.long SYMBOL_NAME(sys_lseek)
	.long SYMBOL_NAME(sys_getpid)		/* 20 */
	.long SYMBOL_NAME(sys_mount)
	.long SYMBOL_NAME(sys_oldumount)
	.long SYMBOL_NAME(sys_setuid)
	.long SYMBOL_NAME(sys_getuid)
	.long SYMBOL_NAME(sys_stime)		/* 25 */
	.long SYMBOL_NAME(sys_ptrace)
	.long SYMBOL_NAME(sys_alarm)
	.long SYMBOL_NAME(sys_fstat)
	.long SYMBOL_NAME(sys_pause)
	.long SYMBOL_NAME(sys_utime)		/* 30 */
	.long SYMBOL_NAME(sys_ni_syscall)	/* old stty holder */
	.long SYMBOL_NAME(sys_ni_syscall)	/* old gtty holder */
	.long SYMBOL_NAME(sys_access)
	.long SYMBOL_NAME(sys_nice)		/*next: old ftime holder*/
	.long SYMBOL_NAME(sys_ni_syscall)	/* 35 */
	.long SYMBOL_NAME(sys_sync)
	.long SYMBOL_NAME(sys_kill)
	.long SYMBOL_NAME(sys_rename)
	.long SYMBOL_NAME(sys_mkdir)
	.long SYMBOL_NAME(sys_rmdir)		/* 40 */
	.long SYMBOL_NAME(sys_dup)
	.long SYMBOL_NAME(sys_pipe)
	.long SYMBOL_NAME(sys_times)
	.long SYMBOL_NAME(sys_ni_syscall)	/* old prof holder */
	.long SYMBOL_NAME(sys_brk)		/* 45 */
	.long SYMBOL_NAME(sys_setgid)
	.long SYMBOL_NAME(sys_getgid)
	.long SYMBOL_NAME(sys_signal)
	.long SYMBOL_NAME(sys_geteuid)
	.long SYMBOL_NAME(sys_getegid)		/* 50 */
	.long SYMBOL_NAME(sys_acct)
	.long SYMBOL_NAME(sys_umount)		/*recyc never used phys*/
	.long SYMBOL_NAME(sys_ni_syscall)	/* old lock holder */
	.long SYMBOL_NAME(sys_ioctl)
	.long SYMBOL_NAME(sys_fcntl)		/* 55 */
	.long SYMBOL_NAME(sys_ni_syscall)	/* old mpx holder */
	.long SYMBOL_NAME(sys_setpgid)
	.long SYMBOL_NAME(sys_ni_syscall)	/*old ulimit holder*/
	.long SYMBOL_NAME(sys_olduname)
	.long SYMBOL_NAME(sys_umask)		/* 60 */
	.long SYMBOL_NAME(sys_chroot)
	.long SYMBOL_NAME(sys_ustat)
	.long SYMBOL_NAME(sys_dup2)
	.long SYMBOL_NAME(sys_getppid)
	.long SYMBOL_NAME(sys_getpgrp)		/* 65 */
	.long SYMBOL_NAME(sys_setsid)
	.long SYMBOL_NAME(sys_sigaction)
	.long SYMBOL_NAME(sys_sgetmask)
	.long SYMBOL_NAME(sys_ssetmask)
	.long SYMBOL_NAME(sys_setreuid)		/* 70 */
	.long SYMBOL_NAME(sys_setregid)
	.long SYMBOL_NAME(sys_sigsuspend)
	.long SYMBOL_NAME(sys_sigpending)
	.long SYMBOL_NAME(sys_sethostname)
	.long SYMBOL_NAME(sys_setrlimit)	/* 75 */
	.long SYMBOL_NAME(sys_getrlimit)
	.long SYMBOL_NAME(sys_getrusage)
	.long SYMBOL_NAME(sys_gettimeofday)
	.long SYMBOL_NAME(sys_settimeofday)
	.long SYMBOL_NAME(sys_getgroups)	/* 80 */
	.long SYMBOL_NAME(sys_setgroups)
	.long SYMBOL_NAME(old_select)
	.long SYMBOL_NAME(sys_symlink)
	.long SYMBOL_NAME(sys_lstat)
	.long SYMBOL_NAME(sys_readlink)		/* 85 */
	.long SYMBOL_NAME(sys_uselib)
	.long SYMBOL_NAME(sys_swapon)
	.long SYMBOL_NAME(sys_reboot)
	.long SYMBOL_NAME(old_readdir)
	.long SYMBOL_NAME(old_mmap)		/* 90 */
	.long SYMBOL_NAME(sys_munmap)
	.long SYMBOL_NAME(sys_truncate)
	.long SYMBOL_NAME(sys_ftruncate)
	.long SYMBOL_NAME(sys_fchmod)
	.long SYMBOL_NAME(sys_fchown)		/* 95 */
	.long SYMBOL_NAME(sys_getpriority)
	.long SYMBOL_NAME(sys_setpriority)
	.long SYMBOL_NAME(sys_ni_syscall)	/*old profil holder*/
	.long SYMBOL_NAME(sys_statfs)
	.long SYMBOL_NAME(sys_fstatfs)		/* 100 */
	.long SYMBOL_NAME(sys_ioperm)
	.long SYMBOL_NAME(sys_socketcall)
	.long SYMBOL_NAME(sys_syslog)
	.long SYMBOL_NAME(sys_setitimer)
	.long SYMBOL_NAME(sys_getitimer)	/* 105 */
	.long SYMBOL_NAME(sys_newstat)
	.long SYMBOL_NAME(sys_newlstat)
	.long SYMBOL_NAME(sys_newfstat)
	.long SYMBOL_NAME(sys_uname)
	.long SYMBOL_NAME(sys_iopl)		/* 110 */
	.long SYMBOL_NAME(sys_vhangup)
	.long SYMBOL_NAME(sys_idle)
	.long SYMBOL_NAME(sys_vm86old)
	.long SYMBOL_NAME(sys_wait4)
	.long SYMBOL_NAME(sys_swapoff)		/* 115 */
	.long SYMBOL_NAME(sys_sysinfo)
	.long SYMBOL_NAME(sys_ipc)
	.long SYMBOL_NAME(sys_fsync)
	.long SYMBOL_NAME(sys_sigreturn)
	.long SYMBOL_NAME(sys_clone)		/* 120 */
	.long SYMBOL_NAME(sys_setdomainname)
	.long SYMBOL_NAME(sys_newuname)
	.long SYMBOL_NAME(sys_modify_ldt)
	.long SYMBOL_NAME(sys_adjtimex)
	.long SYMBOL_NAME(sys_mprotect)		/* 125 */
	.long SYMBOL_NAME(sys_sigprocmask)
	.long SYMBOL_NAME(sys_create_module)
	.long SYMBOL_NAME(sys_init_module)
	.long SYMBOL_NAME(sys_delete_module)
	.long SYMBOL_NAME(sys_get_kernel_syms)	/* 130 */
	.long SYMBOL_NAME(sys_quotactl)
	.long SYMBOL_NAME(sys_getpgid)
	.long SYMBOL_NAME(sys_fchdir)
	.long SYMBOL_NAME(sys_bdflush)
	.long SYMBOL_NAME(sys_sysfs)		/* 135 */
	.long SYMBOL_NAME(sys_personality)
	.long SYMBOL_NAME(sys_ni_syscall)	/* for afs_syscall */
	.long SYMBOL_NAME(sys_setfsuid)
	.long SYMBOL_NAME(sys_setfsgid)
	.long SYMBOL_NAME(sys_llseek)		/* 140 */
	.long SYMBOL_NAME(sys_getdents)
	.long SYMBOL_NAME(sys_select)
	.long SYMBOL_NAME(sys_flock)
	.long SYMBOL_NAME(sys_msync)
	.long SYMBOL_NAME(sys_readv)		/* 145 */
	.long SYMBOL_NAME(sys_writev)
	.long SYMBOL_NAME(sys_getsid)
	.long SYMBOL_NAME(sys_fdatasync)
	.long SYMBOL_NAME(sys_sysctl)
	.long SYMBOL_NAME(sys_mlock)		/* 150 */
	.long SYMBOL_NAME(sys_munlock)
	.long SYMBOL_NAME(sys_mlockall)
	.long SYMBOL_NAME(sys_munlockall)
	.long SYMBOL_NAME(sys_sched_setparam)
	.long SYMBOL_NAME(sys_sched_getparam)	/* 155 */
	.long SYMBOL_NAME(sys_sched_setscheduler)
	.long SYMBOL_NAME(sys_sched_getscheduler)
	.long SYMBOL_NAME(sys_sched_yield)
	.long SYMBOL_NAME(sys_sched_get_priority_max)
	.long SYMBOL_NAME(sys_sched_get_priority_min)	/* 160 */
	.long SYMBOL_NAME(sys_sched_rr_get_interval)
	.long SYMBOL_NAME(sys_nanosleep)
	.long SYMBOL_NAME(sys_mremap)
	.long SYMBOL_NAME(sys_setresuid)
	.long SYMBOL_NAME(sys_getresuid)	/* 165 */
	.long SYMBOL_NAME(sys_vm86)
	.long SYMBOL_NAME(sys_query_module)
	.long SYMBOL_NAME(sys_poll)
	.long SYMBOL_NAME(sys_nfsservctl)
	.long SYMBOL_NAME(sys_setresgid)	/* 170 */
	.long SYMBOL_NAME(sys_getresgid)
	.long SYMBOL_NAME(sys_prctl)
	.long SYMBOL_NAME(sys_rt_sigreturn)
	.long SYMBOL_NAME(sys_rt_sigaction)
	.long SYMBOL_NAME(sys_rt_sigprocmask)	/* 175 */
	.long SYMBOL_NAME(sys_rt_sigpending)
	.long SYMBOL_NAME(sys_rt_sigtimedwait)
	.long SYMBOL_NAME(sys_rt_sigqueueinfo)
	.long SYMBOL_NAME(sys_rt_sigsuspend)
	.long SYMBOL_NAME(sys_pread)		/* 180 */
	.long SYMBOL_NAME(sys_pwrite)
	.long SYMBOL_NAME(sys_chown)
	.long SYMBOL_NAME(sys_getcwd)
	.long SYMBOL_NAME(sys_capget)
	.long SYMBOL_NAME(sys_capset)		/* 185 */
	.long SYMBOL_NAME(sys_sigaltstack)
	.long SYMBOL_NAME(sys_sendfile)
	.long SYMBOL_NAME(sys_ni_syscall)	/* streams1 */
	.long SYMBOL_NAME(sys_ni_syscall)	/* streams2 */
	.long SYMBOL_NAME(sys_vfork)		/* 190 */

	/*
	 * NOTE!! This doesn't have to be exact - we just have
	 * to make sure we have _enough_ of the sys_ni_syscall
	 * entries. Don't panic if you notice that this hasn't
	 * been shrunk every time we add a new system call.
571 */ 572 .rept NR_syscalls-190 573 .long SYMBOL_NAME(sys_ni_syscall) 574 .endr /* FILE: arch/i386/kernel/init_task.c */ 575 #include 576 #include 577 578 #include 579 #include 580 #include 581 582 static struct vm_area_struct init_mmap = INIT_MMAP; 583 static struct fs_struct init_fs = INIT_FS; 584 static struct file * init_fd_array[NR_OPEN] = { NULL, }; 585 static struct files_struct init_files = INIT_FILES; 586 static struct signal_struct init_signals = INIT_SIGNALS; 587 struct mm_struct init_mm = INIT_MM; 588 589 /* Initial task structure. 590 * We need to make sure that this is 8192-byte aligned 591 * due to the way process stacks are handled. This is 592 * done by having a special "init_task" linker map 593 * entry.. */ 594 union task_union init_task_union 595 __attribute__((__section__(".data.init_task"))) = 596 { INIT_TASK }; 597 /* FILE: arch/i386/kernel/irq.c */ 598 /* 599 * linux/arch/i386/kernel/irq.c 600 * 601 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar 602 * 603 * This file contains the code used by various IRQ 604 * handling routines: asking for different IRQ's should 605 * be done through these routines instead of just 606 * grabbing them. Thus setups with different IRQ numbers 607 * shouldn't result in any weird surprises, and 608 * installing new handlers should be easier. */ 609 610 /* IRQs are in fact implemented a bit like signal 611 * handlers for the kernel. Naturally it's not a 1:1 612 * relation, but there are similarities. 
*/ 613 614 #include 615 #include 616 #include 617 #include 618 #include 619 #include 620 #include 621 #include 622 #include 623 #include 624 #include 625 #include 626 #include 627 #include 628 #include 629 630 #include 631 #include 632 #include 633 #include 634 #include 635 #include 636 #include 637 #include 638 639 #include "irq.h" 640 641 unsigned int local_bh_count[NR_CPUS]; 642 unsigned int local_irq_count[NR_CPUS]; 643 644 atomic_t nmi_counter; 645 646 /* Linux has a controller-independent x86 interrupt 647 * architecture. every controller has a 648 * 'controller-template', that is used by the main code 649 * to do the right thing. Each driver-visible interrupt 650 * source is transparently wired to the apropriate 651 * controller. Thus drivers need not be aware of the 652 * interrupt-controller. 653 * 654 * Various interrupt controllers we handle: 8259 PIC, SMP 655 * IO-APIC, PIIX4's internal 8259 PIC and SGI's Visual 656 * Workstation Cobalt (IO-)APIC. (IO-APICs assumed to be 657 * messaging to Pentium local-APICs) 658 * 659 * the code is designed to be easily extended with 660 * new/different interrupt controllers, without having to 661 * do assembly magic. */ 662 663 /* Micro-access to controllers is serialized over the 664 * whole system. We never hold this lock when we call the 665 * actual IRQ handler. */ 666 spinlock_t irq_controller_lock; 667 668 /* Dummy controller type for unused interrupts */ 669 static void do_none(unsigned int irq, 670 struct pt_regs * regs) 671 { 672 /* we are careful. While for ISA irqs it's common to 673 * happen outside of any driver (think autodetection), 674 * this is not at all nice for PCI interrupts. So we 675 * are stricter and print a warning when such spurious 676 * interrupts happen. Spurious interrupts can confuse 677 * other drivers if the PCI IRQ line is shared. 678 * 679 * Such spurious interrupts are either driver bugs, or 680 * sometimes hw (chipset) bugs. 
*/ 681 printk("unexpected IRQ vector %d on CPU#%d!\n", 682 irq, smp_processor_id()); 683 684 #ifdef __SMP__ 685 /* [currently unexpected vectors happen only on SMP and 686 * APIC. if we want to have non-APIC and non-8259A 687 * controllers in the future with unexpected vectors, 688 * this ack should probably be made 689 * controller-specific.] */ 690 ack_APIC_irq(); 691 #endif 692 } 693 static void enable_none(unsigned int irq) { } 694 static void disable_none(unsigned int irq) { } 695 696 /* startup is the same as "enable", shutdown is same as 697 * "disable" */ 698 #define startup_none enable_none 699 #define shutdown_none disable_none 700 701 struct hw_interrupt_type no_irq_type = { 702 "none", 703 startup_none, 704 shutdown_none, 705 do_none, 706 enable_none, 707 disable_none 708 }; 709 710 /* This is the 'legacy' 8259A Programmable Interrupt 711 * Controller, present in the majority of PC/AT boxes. */ 712 713 static void do_8259A_IRQ(unsigned int irq, 714 struct pt_regs * regs); 715 static void enable_8259A_irq(unsigned int irq); 716 void disable_8259A_irq(unsigned int irq); 717 718 /* startup is the same as "enable", shutdown is same as 719 * "disable" */ 720 #define startup_8259A_irq enable_8259A_irq 721 #define shutdown_8259A_irq disable_8259A_irq 722 723 static struct hw_interrupt_type i8259A_irq_type = { 724 "XT-PIC", 725 startup_8259A_irq, 726 shutdown_8259A_irq, 727 do_8259A_IRQ, 728 enable_8259A_irq, 729 disable_8259A_irq 730 }; 731 732 /* Controller mappings for all interrupt sources: */ 733 irq_desc_t irq_desc[NR_IRQS] = { [0 ... 
NR_IRQS-1] = 734 { 0, &no_irq_type, } }; 735 736 737 /* 8259A PIC functions to handle ISA devices: */ 738 739 /* This contains the irq mask for both 8259A irq 740 * controllers, */ 741 static unsigned int cached_irq_mask = 0xffff; 742 743 #define __byte(x,y) (((unsigned char *)&(y))[x]) 744 #define __word(x,y) (((unsigned short *)&(y))[x]) 745 #define __long(x,y) (((unsigned int *)&(y))[x]) 746 747 #define cached_21 (__byte(0,cached_irq_mask)) 748 #define cached_A1 (__byte(1,cached_irq_mask)) 749 750 /* Not all IRQs can be routed through the IO-APIC, eg. on 751 * certain (older) boards the timer interrupt is not 752 * connected to any IO-APIC pin, it's fed to the CPU IRQ 753 * line directly. 754 * 755 * Any '1' bit in this mask means the IRQ is routed 756 * through the IO-APIC. this 'mixed mode' IRQ handling 757 * costs nothing because it's only used at IRQ setup 758 * time. */ 759 unsigned long io_apic_irqs = 0; 760 761 /* These have to be protected by the irq controller 762 * spinlock before being called. 
*/ 763 void disable_8259A_irq(unsigned int irq) 764 { 765 unsigned int mask = 1 << irq; 766 cached_irq_mask |= mask; 767 if (irq & 8) { 768 outb(cached_A1,0xA1); 769 } else { 770 outb(cached_21,0x21); 771 } 772 } 773 774 static void enable_8259A_irq(unsigned int irq) 775 { 776 unsigned int mask = ~(1 << irq); 777 cached_irq_mask &= mask; 778 if (irq & 8) { 779 outb(cached_A1,0xA1); 780 } else { 781 outb(cached_21,0x21); 782 } 783 } 784 785 int i8259A_irq_pending(unsigned int irq) 786 { 787 unsigned int mask = 1<> 8)); 792 } 793 794 void make_8259A_irq(unsigned int irq) 795 { 796 disable_irq(irq); 797 __long(0,io_apic_irqs) &= ~(1<status & ~IRQ_REPLAY; 832 action = NULL; 833 if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) 834 action = desc->action; 835 desc->status = status | IRQ_INPROGRESS; 836 } 837 spin_unlock(&irq_controller_lock); 838 839 /* Exit early if we had no action or it was disabled */ 840 if (!action) 841 return; 842 843 handle_IRQ_event(irq, regs, action); 844 845 spin_lock(&irq_controller_lock); 846 { 847 unsigned int status = desc->status & ~IRQ_INPROGRESS; 848 desc->status = status; 849 if (!(status & IRQ_DISABLED)) 850 enable_8259A_irq(irq); 851 } 852 spin_unlock(&irq_controller_lock); 853 } 854 855 /* This builds up the IRQ handler stubs using some ugly 856 * macros in irq.h 857 * 858 * These macros create the low-level assembly IRQ 859 * routines that save register context and call do_IRQ(). 860 * do_IRQ() then does all the operations that are needed 861 * to keep the AT (or SMP IOAPIC) interrupt-controller 862 * happy. 
*/ 863 864 BUILD_COMMON_IRQ() 865 866 #define BI(x,y) \ 867 BUILD_IRQ(##x##y) 868 869 #define BUILD_16_IRQS(x) \ 870 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ 871 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ 872 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ 873 BI(x,c) BI(x,d) BI(x,e) BI(x,f) 874 875 /* ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) 876 * interrupts: (these are usually mapped to vectors 877 * 0x20-0x30) */ 878 BUILD_16_IRQS(0x0) 879 880 #ifdef CONFIG_X86_IO_APIC 881 /* The IO-APIC gives us many more interrupt sources. Most 882 * of these are unused but an SMP system is supposed to 883 * have enough memory ... sometimes (mostly wrt. hw 884 * bugs) we get corrupted vectors all across the 885 * spectrum, so we really want to be prepared to get all 886 * of these. Plus, more powerful systems might have more 887 * than 64 IO-APIC registers. 888 * 889 * (these are usually mapped into the 0x30-0xff vector 890 * range) */ 891 BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) 892 BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) 893 BUILD_16_IRQS(0x7) BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) 894 BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) BUILD_16_IRQS(0xc) 895 BUILD_16_IRQS(0xd) 896 #endif 897 898 #undef BUILD_16_IRQS 899 #undef BI 900 901 902 #ifdef __SMP__ 903 /* The following vectors are part of the Linux 904 * architecture, there is no hardware IRQ pin equivalent 905 * for them, they are triggered through the ICC by us 906 * (IPIs) */ 907 BUILD_SMP_INTERRUPT(reschedule_interrupt) 908 BUILD_SMP_INTERRUPT(invalidate_interrupt) 909 BUILD_SMP_INTERRUPT(stop_cpu_interrupt) 910 BUILD_SMP_INTERRUPT(mtrr_interrupt) 911 BUILD_SMP_INTERRUPT(spurious_interrupt) 912 913 /* every pentium local APIC has two 'local interrupts', 914 * with a soft-definable vector attached to both 915 * interrupts, one of which is a timer interrupt, the 916 * other one is error counter overflow. 
Linux uses the 917 * local APIC timer interrupt to get a much simpler SMP 918 * time architecture: */ 919 BUILD_SMP_TIMER_INTERRUPT(apic_timer_interrupt) 920 921 #endif 922 923 #define IRQ(x,y) \ 924 IRQ##x##y##_interrupt 925 926 #define IRQLIST_16(x) \ 927 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ 928 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ 929 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ 930 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) 931 932 static void (*interrupt[NR_IRQS])(void) = { 933 IRQLIST_16(0x0), 934 935 #ifdef CONFIG_X86_IO_APIC 936 IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), 937 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), 938 IRQLIST_16(0x7), IRQLIST_16(0x8), IRQLIST_16(0x9), 939 IRQLIST_16(0xa), IRQLIST_16(0xb), IRQLIST_16(0xc), 940 IRQLIST_16(0xd) 941 #endif 942 }; 943 944 #undef IRQ 945 #undef IRQLIST_16 946 947 948 /* Special irq handlers. */ 949 950 void no_action(int cpl, void *dev_id, 951 struct pt_regs *regs) 952 {} 953 954 #ifndef CONFIG_VISWS 955 /* Note that on a 486, we don't want to do a SIGFPE on an 956 * irq13 as the irq is unreliable, and exception 16 works 957 * correctly (ie as explained in the intel 958 * literature). On a 386, you can't use exception 16 due 959 * to bad IBM design, so we have to rely on the less 960 * exact irq13. 961 * 962 * Careful.. Not only is IRQ13 unreliable, but it is also 963 * leads to races. IBM designers who came up with it 964 * should be shot. 
*/ 965 static void math_error_irq(int cpl, void *dev_id, 966 struct pt_regs *regs) 967 { 968 outb(0,0xF0); 969 if (ignore_irq13 || !boot_cpu_data.hard_math) 970 return; 971 math_error(); 972 } 973 974 static struct irqaction irq13 = 975 { math_error_irq, 0, 0, "fpu", NULL, NULL }; 976 977 /* IRQ2 is cascade interrupt to second interrupt 978 * controller */ 979 static struct irqaction irq2 = 980 { no_action, 0, 0, "cascade", NULL, NULL}; 981 #endif 982 983 /* Generic, controller-independent functions: */ 984 985 int get_irq_list(char *buf) 986 { 987 int i, j; 988 struct irqaction * action; 989 char *p = buf; 990 991 p += sprintf(p, " "); 992 for (j=0; jtypename); 1010 p += sprintf(p, " %s", action->name); 1011 1012 for (action=action->next; action; 1013 action = action->next) { 1014 p += sprintf(p, ", %s", action->name); 1015 } 1016 *p++ = '\n'; 1017 } 1018 p += sprintf(p, "NMI: %10u\n", 1019 atomic_read(&nmi_counter)); 1020 #ifdef __SMP__ 1021 p += sprintf(p, "ERR: %10lu\n", ipi_count); 1022 #endif 1023 return p - buf; 1024 } 1025 1026 /* Global interrupt locks for SMP. Allow interrupts to 1027 * come in on any CPU, yet make cli/sti act globally to 1028 * protect critical regions.. */ 1029 #ifdef __SMP__ 1030 unsigned char global_irq_holder = NO_PROC_ID; 1031 unsigned volatile int global_irq_lock; 1032 atomic_t global_irq_count; 1033 1034 atomic_t global_bh_count; 1035 atomic_t global_bh_lock; 1036 1037 /* "global_cli()" is a special case, in that it can hold 1038 * the interrupts disabled for a longish time, and also 1039 * because we may be doing TLB invalidates when holding 1040 * the global IRQ lock for historical reasons. 
Thus we 1041 * may need to check SMP invalidate events specially by 1042 * hand here (but not in any normal spinlocks) */ 1043 static inline void check_smp_invalidate(int cpu) 1044 { 1045 if (test_bit(cpu, &smp_invalidate_needed)) { 1046 clear_bit(cpu, &smp_invalidate_needed); 1047 local_flush_tlb(); 1048 } 1049 } 1050 1051 static void show(char * str) 1052 { 1053 int i; 1054 unsigned long *stack; 1055 int cpu = smp_processor_id(); 1056 extern char *get_options(char *str, int *ints); 1057 1058 printk("\n%s, CPU %d:\n", str, cpu); 1059 printk("irq: %d [%d %d]\n", 1060 atomic_read(&global_irq_count), local_irq_count[0], 1061 local_irq_count[1]); 1062 printk("bh: %d [%d %d]\n", 1063 atomic_read(&global_bh_count), local_bh_count[0], 1064 local_bh_count[1]); 1065 stack = (unsigned long *) &stack; 1066 for (i = 40; i ; i--) { 1067 unsigned long x = *++stack; 1068 if (x > (unsigned long) &get_options && 1069 x < (unsigned long) &vsprintf) { 1070 printk("<[%08lx]> ", x); 1071 } 1072 } 1073 } 1074 1075 #define MAXCOUNT 100000000 1076 1077 static inline void wait_on_bh(void) 1078 { 1079 int count = MAXCOUNT; 1080 do { 1081 if (!--count) { 1082 show("wait_on_bh"); 1083 count = ~0; 1084 } 1085 /* nothing .. wait for the other bh's to go away */ 1086 } while (atomic_read(&global_bh_count) != 0); 1087 } 1088 1089 /* I had a lockup scenario where a tight loop doing 1090 * spin_unlock()/spin_lock() on CPU#1 was racing with 1091 * spin_lock() on CPU#0. CPU#0 should have noticed 1092 * spin_unlock(), but apparently the spin_unlock() 1093 * information did not make it through to CPU#0 1094 * ... nasty, is this by design, do we have to limit 1095 * 'memory update oscillation frequency' artificially 1096 * like here? 1097 * 1098 * Such 'high frequency update' races can be avoided by 1099 * careful design, but some of our major constructs like 1100 * spinlocks use similar techniques, it would be nice to 1101 * clarify this issue. 
Set this define to 0 if you want 1102 * to check whether your system freezes. I suspect the 1103 * delay done by SYNC_OTHER_CORES() is in correlation 1104 * with 'snooping latency', but i thought that such 1105 * things are guaranteed by design, since we use the 1106 * 'LOCK' prefix. */ 1107 #define SUSPECTED_CPU_OR_CHIPSET_BUG_WORKAROUND 1 1108 1109 #if SUSPECTED_CPU_OR_CHIPSET_BUG_WORKAROUND 1110 # define SYNC_OTHER_CORES(x) udelay(x+1) 1111 #else 1112 /* We have to allow irqs to arrive between __sti and 1113 * __cli */ 1114 # define SYNC_OTHER_CORES(x) __asm__ __volatile__ ("nop") 1115 #endif 1116 1117 static inline void wait_on_irq(int cpu) 1118 { 1119 int count = MAXCOUNT; 1120 1121 for (;;) { 1122 1123 /* Wait until all interrupts are gone. Wait for 1124 * bottom half handlers unless we're already 1125 * executing in one.. */ 1126 if (!atomic_read(&global_irq_count)) { 1127 if (local_bh_count[cpu] || 1128 !atomic_read(&global_bh_count)) 1129 break; 1130 } 1131 1132 /* Duh, we have to loop. Release the lock to avoid 1133 * deadlocks */ 1134 clear_bit(0, &global_irq_lock); 1135 1136 for (;;) { 1137 if (!--count) { 1138 show("wait_on_irq"); 1139 count = ~0; 1140 } 1141 __sti(); 1142 SYNC_OTHER_CORES(cpu); 1143 __cli(); 1144 check_smp_invalidate(cpu); 1145 if (atomic_read(&global_irq_count)) 1146 continue; 1147 if (global_irq_lock) 1148 continue; 1149 if (!local_bh_count[cpu] && 1150 atomic_read(&global_bh_count)) 1151 continue; 1152 if (!test_and_set_bit(0,&global_irq_lock)) 1153 break; 1154 } 1155 } 1156 } 1157 1158 /* This is called when we want to synchronize with bottom 1159 * half handlers. We need to wait until no other CPU is 1160 * executing any bottom half handler. 1161 * 1162 * Don't wait if we're already running in an interrupt 1163 * context or are inside a bh handler. 
*/ 1164 void synchronize_bh(void) 1165 { 1166 if (atomic_read(&global_bh_count) && !in_interrupt()) 1167 wait_on_bh(); 1168 } 1169 1170 /* This is called when we want to synchronize with 1171 * interrupts. We may for example tell a device to stop 1172 * sending interrupts: but to make sure there are no 1173 * interrupts that are executing on another CPU we need 1174 * to call this function. */ 1175 void synchronize_irq(void) 1176 { 1177 if (atomic_read(&global_irq_count)) { 1178 /* Stupid approach */ 1179 cli(); 1180 sti(); 1181 } 1182 } 1183 1184 static inline void get_irqlock(int cpu) 1185 { 1186 if (test_and_set_bit(0,&global_irq_lock)) { 1187 /* do we already hold the lock? */ 1188 if ((unsigned char) cpu == global_irq_holder) 1189 return; 1190 /* Uhhuh.. Somebody else got it. Wait.. */ 1191 do { 1192 do { 1193 check_smp_invalidate(cpu); 1194 } while (test_bit(0,&global_irq_lock)); 1195 } while (test_and_set_bit(0,&global_irq_lock)); 1196 } 1197 /* We also to make sure that nobody else is running in 1198 * an interrupt context. */ 1199 wait_on_irq(cpu); 1200 1201 /* Ok, finally.. */ 1202 global_irq_holder = cpu; 1203 } 1204 1205 #define EFLAGS_IF_SHIFT 9 1206 1207 /* A global "cli()" while in an interrupt context turns 1208 * into just a local cli(). Interrupts should use 1209 * spinlocks for the (very unlikely) case that they ever 1210 * want to protect against each other. 1211 * 1212 * If we already have local interrupts disabled, this 1213 * will not turn a local disable into a global one 1214 * (problems with spinlocks: this makes 1215 * save_flags+cli+sti usable inside a spinlock). 
*/ 1216 void __global_cli(void) 1217 { 1218 unsigned int flags; 1219 1220 __save_flags(flags); 1221 if (flags & (1 << EFLAGS_IF_SHIFT)) { 1222 int cpu = smp_processor_id(); 1223 __cli(); 1224 if (!local_irq_count[cpu]) 1225 get_irqlock(cpu); 1226 } 1227 } 1228 1229 void __global_sti(void) 1230 { 1231 int cpu = smp_processor_id(); 1232 1233 if (!local_irq_count[cpu]) 1234 release_irqlock(cpu); 1235 __sti(); 1236 } 1237 1238 /* SMP flags value to restore to: 1239 * 0 - global cli 1240 * 1 - global sti 1241 * 2 - local cli 1242 * 3 - local sti */ 1243 unsigned long __global_save_flags(void) 1244 { 1245 int retval; 1246 int local_enabled; 1247 unsigned long flags; 1248 1249 __save_flags(flags); 1250 local_enabled = (flags >> EFLAGS_IF_SHIFT) & 1; 1251 /* default to local */ 1252 retval = 2 + local_enabled; 1253 1254 /*check for global flags if we're not in an interrupt*/ 1255 if (!local_irq_count[smp_processor_id()]) { 1256 if (local_enabled) 1257 retval = 1; 1258 if (global_irq_holder == 1259 (unsigned char) smp_processor_id()) 1260 retval = 0; 1261 } 1262 return retval; 1263 } 1264 1265 void __global_restore_flags(unsigned long flags) 1266 { 1267 switch (flags) { 1268 case 0: 1269 __global_cli(); 1270 break; 1271 case 1: 1272 __global_sti(); 1273 break; 1274 case 2: 1275 __cli(); 1276 break; 1277 case 3: 1278 __sti(); 1279 break; 1280 default: 1281 printk("global_restore_flags: %08lx (%08lx)\n", 1282 flags, (&flags)[-1]); 1283 } 1284 } 1285 1286 #endif 1287 1288 /* This should really return information about whether we 1289 * should do bottom half handling etc. Right now we end 1290 * up _always_ checking the bottom half, which is a waste 1291 * of time and is not what some drivers would prefer. 
*/ 1292 int handle_IRQ_event(unsigned int irq, 1293 struct pt_regs * regs, struct irqaction * action) 1294 { 1295 int status; 1296 int cpu = smp_processor_id(); 1297 1298 irq_enter(cpu, irq); 1299 1300 status = 1; /* Force the "do bottom halves" bit */ 1301 1302 if (!(action->flags & SA_INTERRUPT)) 1303 __sti(); 1304 1305 do { 1306 status |= action->flags; 1307 action->handler(irq, action->dev_id, regs); 1308 action = action->next; 1309 } while (action); 1310 if (status & SA_SAMPLE_RANDOM) 1311 add_interrupt_randomness(irq); 1312 __cli(); 1313 1314 irq_exit(cpu, irq); 1315 1316 return status; 1317 } 1318 1319 /* Generic enable/disable code: this just calls down into 1320 * the PIC-specific version for the actual hardware 1321 * disable after having gotten the irq controller lock. 1322 */ 1323 void disable_irq(unsigned int irq) 1324 { 1325 unsigned long flags; 1326 1327 spin_lock_irqsave(&irq_controller_lock, flags); 1328 if (!irq_desc[irq].depth++) { 1329 irq_desc[irq].status |= IRQ_DISABLED; 1330 irq_desc[irq].handler->disable(irq); 1331 } 1332 spin_unlock_irqrestore(&irq_controller_lock, flags); 1333 1334 if (irq_desc[irq].status & IRQ_INPROGRESS) 1335 synchronize_irq(); 1336 } 1337 1338 void enable_irq(unsigned int irq) 1339 { 1340 unsigned long flags; 1341 1342 spin_lock_irqsave(&irq_controller_lock, flags); 1343 switch (irq_desc[irq].depth) { 1344 case 1: 1345 irq_desc[irq].status &= ~(IRQ_DISABLED | 1346 IRQ_INPROGRESS); 1347 irq_desc[irq].handler->enable(irq); 1348 /* fall throught */ 1349 default: 1350 irq_desc[irq].depth--; 1351 break; 1352 case 0: 1353 printk("enable_irq() unbalanced from %p\n", 1354 __builtin_return_address(0)); 1355 } 1356 spin_unlock_irqrestore(&irq_controller_lock, flags); 1357 } 1358 1359 /* do_IRQ handles all normal device IRQ's (the special 1360 * SMP cross-CPU interrupts have their own specific 1361 * handlers). 
*/ 1362 asmlinkage void do_IRQ(struct pt_regs regs) 1363 { 1364 /* We ack quickly, we don't want the irq controller 1365 * thinking we're snobs just because some other CPU has 1366 * disabled global interrupts (we have already done the 1367 * INT_ACK cycles, it's too late to try to pretend to 1368 * the controller that we aren't taking the interrupt). 1369 * 1370 * 0 return value means that this irq is already being 1371 * handled by some other CPU. (or is disabled) */ 1372 int irq = regs.orig_eax & 0xff; /* subtle, see irq.h */ 1373 int cpu = smp_processor_id(); 1374 1375 kstat.irqs[cpu][irq]++; 1376 irq_desc[irq].handler->handle(irq, ®s); 1377 1378 /* This should be conditional: we should really get a 1379 * return code from the irq handler to tell us whether 1380 * the handler wants us to do software bottom half 1381 * handling or not.. */ 1382 if (1) { 1383 if (bh_active & bh_mask) 1384 do_bottom_half(); 1385 } 1386 } 1387 1388 int setup_x86_irq(unsigned int irq, 1389 struct irqaction * new) 1390 { 1391 int shared = 0; 1392 struct irqaction *old, **p; 1393 unsigned long flags; 1394 1395 /* Some drivers like serial.c use request_irq() 1396 * heavily, so we have to be careful not to interfere 1397 * with a running system. */ 1398 if (new->flags & SA_SAMPLE_RANDOM) { 1399 /* This function might sleep, we want to call it 1400 * first, outside of the atomic block. Yes, this 1401 * might clear the entropy pool if the wrong driver 1402 * is attempted to be loaded, without actually 1403 * installing a new handler, but is this really a 1404 * problem, only the sysadmin is able to do this. 
*/ 1405 rand_initialize_irq(irq); 1406 } 1407 1408 /* The following block of code has to be executed 1409 * atomically */ 1410 spin_lock_irqsave(&irq_controller_lock,flags); 1411 p = &irq_desc[irq].action; 1412 if ((old = *p) != NULL) { 1413 /* Can't share interrupts unless both agree to */ 1414 if (!(old->flags & new->flags & SA_SHIRQ)) { 1415 spin_unlock_irqrestore(&irq_controller_lock,flags); 1416 return -EBUSY; 1417 } 1418 1419 /* add new interrupt at end of irq queue */ 1420 do { 1421 p = &old->next; 1422 old = *p; 1423 } while (old); 1424 shared = 1; 1425 } 1426 1427 *p = new; 1428 1429 if (!shared) { 1430 irq_desc[irq].depth = 0; 1431 irq_desc[irq].status &= ~(IRQ_DISABLED | 1432 IRQ_INPROGRESS); 1433 irq_desc[irq].handler->startup(irq); 1434 } 1435 spin_unlock_irqrestore(&irq_controller_lock,flags); 1436 return 0; 1437 } 1438 1439 int request_irq(unsigned int irq, 1440 void (*handler)(int, void *, struct pt_regs *), 1441 unsigned long irqflags, 1442 const char * devname, 1443 void *dev_id) 1444 { 1445 int retval; 1446 struct irqaction * action; 1447 1448 if (irq >= NR_IRQS) 1449 return -EINVAL; 1450 if (!handler) 1451 return -EINVAL; 1452 1453 action = (struct irqaction *) 1454 kmalloc(sizeof(struct irqaction), GFP_KERNEL); 1455 if (!action) 1456 return -ENOMEM; 1457 1458 action->handler = handler; 1459 action->flags = irqflags; 1460 action->mask = 0; 1461 action->name = devname; 1462 action->next = NULL; 1463 action->dev_id = dev_id; 1464 1465 retval = setup_x86_irq(irq, action); 1466 1467 if (retval) 1468 kfree(action); 1469 return retval; 1470 } 1471 1472 void free_irq(unsigned int irq, void *dev_id) 1473 { 1474 struct irqaction * action, **p; 1475 unsigned long flags; 1476 1477 if (irq >= NR_IRQS) 1478 return; 1479 1480 spin_lock_irqsave(&irq_controller_lock,flags); 1481 for (p = &irq_desc[irq].action; 1482 (action = *p) != NULL; p = &action->next) { 1483 if (action->dev_id != dev_id) 1484 continue; 1485 1486 /* Found it - now free it */ 1487 *p = 
action->next; 1488 kfree(action); 1489 if (!irq_desc[irq].action) { 1490 irq_desc[irq].status |= IRQ_DISABLED; 1491 irq_desc[irq].handler->shutdown(irq); 1492 } 1493 goto out; 1494 } 1495 printk("Trying to free free IRQ%d\n",irq); 1496 out: 1497 spin_unlock_irqrestore(&irq_controller_lock,flags); 1498 } 1499 1500 /* IRQ autodetection code.. 1501 * 1502 * This depends on the fact that any interrupt that comes 1503 * in on to an unassigned handler will get stuck with 1504 * "IRQ_INPROGRESS" asserted and the interrupt disabled. 1505 */ 1506 unsigned long probe_irq_on(void) 1507 { 1508 unsigned int i; 1509 unsigned long delay; 1510 1511 /* first, enable any unassigned irqs */ 1512 spin_lock_irq(&irq_controller_lock); 1513 for (i = NR_IRQS-1; i > 0; i--) { 1514 if (!irq_desc[i].action) { 1515 unsigned int status = 1516 irq_desc[i].status | IRQ_AUTODETECT; 1517 irq_desc[i].status = status & ~IRQ_INPROGRESS; 1518 irq_desc[i].handler->startup(i); 1519 } 1520 } 1521 spin_unlock_irq(&irq_controller_lock); 1522 1523 /* Wait for spurious interrupts to trigger */ 1524 for (delay = jiffies + HZ/10; 1525 time_after(delay, jiffies); ) 1526 /* about 100ms delay */ synchronize_irq(); 1527 1528 /* Now filter out any obviously spurious interrupts */ 1529 spin_lock_irq(&irq_controller_lock); 1530 for (i=0; ishutdown(i); 1540 } 1541 } 1542 spin_unlock_irq(&irq_controller_lock); 1543 1544 return 0x12345678; 1545 } 1546 1547 int probe_irq_off(unsigned long unused) 1548 { 1549 int i, irq_found, nr_irqs; 1550 1551 if (unused != 0x12345678) 1552 printk("Bad IRQ probe from %lx\n", (&unused)[-1]); 1553 1554 nr_irqs = 0; 1555 irq_found = 0; 1556 spin_lock_irq(&irq_controller_lock); 1557 for (i=0; ishutdown(i); 1570 } 1571 spin_unlock_irq(&irq_controller_lock); 1572 1573 if (nr_irqs > 1) 1574 irq_found = -irq_found; 1575 return irq_found; 1576 } 1577 1578 void init_ISA_irqs (void) 1579 { 1580 int i; 1581 1582 for (i = 0; i < NR_IRQS; i++) { 1583 irq_desc[i].status = IRQ_DISABLED; 1584 
irq_desc[i].action = 0; 1585 irq_desc[i].depth = 0; 1586 1587 if (i < 16) { 1588 /* 16 old-style INTA-cycle interrupts: */ 1589 irq_desc[i].handler = &i8259A_irq_type; 1590 } else { 1591 /* 'high' PCI IRQs filled in on demand */ 1592 irq_desc[i].handler = &no_irq_type; 1593 } 1594 } 1595 } 1596 1597 __initfunc(void init_IRQ(void)) 1598 { 1599 int i; 1600 1601 #ifndef CONFIG_X86_VISWS_APIC 1602 init_ISA_irqs(); 1603 #else 1604 init_VISWS_APIC_irqs(); 1605 #endif 1606 /* Cover the whole vector space, no vector can escape 1607 * us. (some of these will be overridden and become 1608 * 'special' SMP interrupts) */ 1609 for (i = 0; i < NR_IRQS; i++) { 1610 int vector = FIRST_EXTERNAL_VECTOR + i; 1611 if (vector != SYSCALL_VECTOR) 1612 set_intr_gate(vector, interrupt[i]); 1613 } 1614 1615 #ifdef __SMP__ 1616 1617 /* IRQ0 must be given a fixed assignment and 1618 * initialized before init_IRQ_SMP. */ 1619 set_intr_gate(IRQ0_TRAP_VECTOR, interrupt[0]); 1620 1621 /* The reschedule interrupt is a CPU-to-CPU 1622 * reschedule-helper IPI, driven by wakeup. 
*/ 1623 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 1624 1625 /* IPI for invalidation */ 1626 set_intr_gate(INVALIDATE_TLB_VECTOR, 1627 invalidate_interrupt); 1628 1629 /* IPI for CPU halt */ 1630 set_intr_gate(STOP_CPU_VECTOR, stop_cpu_interrupt); 1631 1632 /* self generated IPI for local APIC timer */ 1633 set_intr_gate(LOCAL_TIMER_VECTOR,apic_timer_interrupt); 1634 1635 /* IPI for MTRR control */ 1636 set_intr_gate(MTRR_CHANGE_VECTOR, mtrr_interrupt); 1637 1638 /* IPI vector for APIC spurious interrupts */ 1639 set_intr_gate(SPURIOUS_APIC_VECTOR,spurious_interrupt); 1640 #endif 1641 request_region(0x20,0x20,"pic1"); 1642 request_region(0xa0,0x20,"pic2"); 1643 1644 /* Set the clock to 100 Hz, we already have a valid 1645 * vector now: */ 1646 outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ 1647 outb_p(LATCH & 0xff , 0x40); /* LSB */ 1648 outb(LATCH >> 8 , 0x40); /* MSB */ 1649 1650 #ifndef CONFIG_VISWS 1651 setup_x86_irq(2, &irq2); 1652 setup_x86_irq(13, &irq13); 1653 #endif 1654 } 1655 1656 #ifdef CONFIG_X86_IO_APIC 1657 __initfunc(void init_IRQ_SMP(void)) 1658 { 1659 int i; 1660 for (i = 0; i < NR_IRQS ; i++) 1661 if (IO_APIC_VECTOR(i) > 0) 1662 set_intr_gate(IO_APIC_VECTOR(i), interrupt[i]); 1663 } 1664 #endif 1665 /* FILE: arch/i386/kernel/irq.h */ 1666 #ifndef __irq_h 1667 #define __irq_h 1668 1669 #include 1670 1671 /* Interrupt controller descriptor. This is all we need 1672 * to describe about the low-level hardware. */ 1673 struct hw_interrupt_type { 1674 const char * typename; 1675 void (*startup)(unsigned int irq); 1676 void (*shutdown)(unsigned int irq); 1677 void (*handle)(unsigned int irq,struct pt_regs * regs); 1678 void (*enable)(unsigned int irq); 1679 void (*disable)(unsigned int irq); 1680 }; 1681 1682 extern struct hw_interrupt_type no_irq_type; 1683 1684 /* IRQ line status. */ 1685 #define IRQ_INPROGRESS 1 /* active - do not enter! */ 1686 #define IRQ_DISABLED 2 /* disabled - do not enter! 
*/ 1687 #define IRQ_PENDING 4 /* pending, replay on enable*/ 1688 #define IRQ_REPLAY 8 /* replayed but not acked */ 1689 #define IRQ_AUTODETECT 16 /* IRQ being autodetected */ 1690 1691 /* This is the "IRQ descriptor", which contains various 1692 * information about the irq, including what kind of 1693 * hardware handling it has, whether it is disabled etc 1694 * etc. 1695 * 1696 * Pad this out to 32 bytes for cache and indexing 1697 * reasons. */ 1698 typedef struct { 1699 /* IRQ status - IRQ_INPROGRESS, IRQ_DISABLED */ 1700 unsigned int status; 1701 /* handle/enable/disable functions */ 1702 struct hw_interrupt_type *handler; 1703 /* IRQ action list */ 1704 struct irqaction *action; 1705 /* Disable depth for nested irq disables */ 1706 unsigned int depth; 1707 } irq_desc_t; 1708 1709 /* IDT vectors usable for external interrupt sources 1710 * start at 0x20: */ 1711 #define FIRST_EXTERNAL_VECTOR 0x20 1712 1713 #define SYSCALL_VECTOR 0x80 1714 1715 /* Vectors 0x20-0x2f are used for ISA interrupts. */ 1716 1717 /* Special IRQ vectors used by the SMP architecture: 1718 * 1719 * (some of the following vectors are 'rare', they might 1720 * be merged into a single vector to save vector 1721 * space. TLB, reschedule and local APIC vectors are 1722 * performance-critical.) */ 1723 #define RESCHEDULE_VECTOR 0x30 1724 #define INVALIDATE_TLB_VECTOR 0x31 1725 #define STOP_CPU_VECTOR 0x40 1726 #define LOCAL_TIMER_VECTOR 0x41 1727 #define MTRR_CHANGE_VECTOR 0x50 1728 1729 /* First APIC vector available to drivers: (vectors 1730 * 0x51-0xfe) */ 1731 #define IRQ0_TRAP_VECTOR 0x51 1732 1733 /* This IRQ should never happen, but we print a message 1734 nevertheless. 
*/ 1735 #define SPURIOUS_APIC_VECTOR 0xff 1736 1737 extern irq_desc_t irq_desc[NR_IRQS]; 1738 extern int irq_vector[NR_IRQS]; 1739 #define IO_APIC_VECTOR(irq) irq_vector[irq] 1740 1741 extern void init_IRQ_SMP(void); 1742 extern int handle_IRQ_event(unsigned int, 1743 struct pt_regs *, struct irqaction *); 1744 extern int setup_x86_irq(unsigned int, 1745 struct irqaction *); 1746 1747 /* Various low-level irq details needed by irq.c, 1748 * process.c, time.c, io_apic.c and smp.c 1749 * 1750 * Interrupt entry/exit code at both C and assembly level 1751 */ 1752 1753 extern void no_action(int cpl, void *dev_id, 1754 struct pt_regs *regs); 1755 extern void mask_irq(unsigned int irq); 1756 extern void unmask_irq(unsigned int irq); 1757 extern void disable_8259A_irq(unsigned int irq); 1758 extern int i8259A_irq_pending(unsigned int irq); 1759 extern void ack_APIC_irq(void); 1760 extern void FASTCALL(send_IPI_self(int vector)); 1761 extern void smp_send_mtrr(void); 1762 extern void init_VISWS_APIC_irqs(void); 1763 extern void setup_IO_APIC(void); 1764 extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, 1765 int fn); 1766 extern void make_8259A_irq(unsigned int irq); 1767 extern void send_IPI(int dest, int vector); 1768 extern void init_pic_mode(void); 1769 extern void print_IO_APIC(void); 1770 1771 extern unsigned long io_apic_irqs; 1772 1773 extern char _stext, _etext; 1774 1775 #define MAX_IRQ_SOURCES 128 1776 #define MAX_MP_BUSSES 32 1777 enum mp_bustype { 1778 MP_BUS_ISA, 1779 MP_BUS_PCI 1780 }; 1781 extern int mp_bus_id_to_type [MAX_MP_BUSSES]; 1782 extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES]; 1783 extern char ioapic_OEM_ID [16]; 1784 extern char ioapic_Product_ID [16]; 1785 1786 extern spinlock_t irq_controller_lock; 1787 1788 #ifdef __SMP__ 1789 1790 #include 1791 1792 static inline void irq_enter(int cpu, unsigned int irq) 1793 { 1794 hardirq_enter(cpu); 1795 while (test_bit(0,&global_irq_lock)) { 1796 /* nothing */; 1797 } 1798 } 1799 1800 static inline 
void irq_exit(int cpu, unsigned int irq) 1801 { 1802 hardirq_exit(cpu); 1803 } 1804 1805 #define IO_APIC_IRQ(x) (((x) >= 16) || \ 1806 ((1<<(x)) & io_apic_irqs)) 1807 1808 #else 1809 1810 #define irq_enter(cpu, irq) (++local_irq_count[cpu]) 1811 #define irq_exit(cpu, irq) (--local_irq_count[cpu]) 1812 1813 #define IO_APIC_IRQ(x) (0) 1814 1815 #endif 1816 1817 #define __STR(x) #x 1818 #define STR(x) __STR(x) 1819 1820 #define SAVE_ALL \ 1821 "cld\n\t" \ 1822 "pushl %es\n\t" \ 1823 "pushl %ds\n\t" \ 1824 "pushl %eax\n\t" \ 1825 "pushl %ebp\n\t" \ 1826 "pushl %edi\n\t" \ 1827 "pushl %esi\n\t" \ 1828 "pushl %edx\n\t" \ 1829 "pushl %ecx\n\t" \ 1830 "pushl %ebx\n\t" \ 1831 "movl $" STR(__KERNEL_DS) ",%edx\n\t" \ 1832 "movl %dx,%ds\n\t" \ 1833 "movl %dx,%es\n\t" 1834 1835 #define IRQ_NAME2(nr) nr##_interrupt(void) 1836 #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) 1837 1838 #define GET_CURRENT \ 1839 "movl %esp, %ebx\n\t" \ 1840 "andl $-8192, %ebx\n\t" 1841 1842 #ifdef __SMP__ 1843 1844 /* SMP has a few special interrupts for IPI messages */ 1845 1846 #define BUILD_SMP_INTERRUPT(x) \ 1847 asmlinkage void x(void); \ 1848 __asm__( \ 1849 "\n"__ALIGN_STR"\n" \ 1850 SYMBOL_NAME_STR(x) ":\n\t" \ 1851 "pushl $-1\n\t" \ 1852 SAVE_ALL \ 1853 "call "SYMBOL_NAME_STR(smp_##x)"\n\t" \ 1854 "jmp ret_from_intr\n"); 1855 1856 #define BUILD_SMP_TIMER_INTERRUPT(x) \ 1857 asmlinkage void x(struct pt_regs * regs); \ 1858 __asm__( \ 1859 "\n"__ALIGN_STR"\n" \ 1860 SYMBOL_NAME_STR(x) ":\n\t" \ 1861 "pushl $-1\n\t" \ 1862 SAVE_ALL \ 1863 "movl %esp,%eax\n\t" \ 1864 "pushl %eax\n\t" \ 1865 "call "SYMBOL_NAME_STR(smp_##x)"\n\t" \ 1866 "addl $4,%esp\n\t" \ 1867 "jmp ret_from_intr\n"); 1868 1869 #endif /* __SMP__ */ 1870 1871 #define BUILD_COMMON_IRQ() \ 1872 __asm__( \ 1873 "\n" __ALIGN_STR"\n" \ 1874 "common_interrupt:\n\t" \ 1875 SAVE_ALL \ 1876 "pushl $ret_from_intr\n\t" \ 1877 "jmp "SYMBOL_NAME_STR(do_IRQ)); 1878 1879 /* subtle. 
orig_eax is used by the signal code to 1880 * distinct between system calls and interrupted 'random 1881 * user-space'. Thus we have to put a negative value into 1882 * orig_eax here. (the problem is that both system calls 1883 * and IRQs want to have small integer numbers in 1884 * orig_eax, and the syscall code has won the 1885 * optimization conflict ;) */ 1886 #define BUILD_IRQ(nr) \ 1887 asmlinkage void IRQ_NAME(nr); \ 1888 __asm__( \ 1889 "\n"__ALIGN_STR"\n" \ 1890 SYMBOL_NAME_STR(IRQ) #nr "_interrupt:\n\t" \ 1891 "pushl $"#nr"-256\n\t" \ 1892 "jmp common_interrupt"); 1893 1894 /* x86 profiling function, SMP safe. We might want to do 1895 * this in assembly totally? */ 1896 static inline void x86_do_profile (unsigned long eip) 1897 { 1898 if (prof_buffer && current->pid) { 1899 eip -= (unsigned long) &_stext; 1900 eip >>= prof_shift; 1901 /* Don't ignore out-of-bounds EIP values silently, 1902 * put them into the last histogram slot, so if 1903 * present, they will show up as a sharp peak. */ 1904 if (eip > prof_len-1) 1905 eip = prof_len-1; 1906 atomic_inc((atomic_t *)&prof_buffer[eip]); 1907 } 1908 } 1909 1910 #endif /* FILE: arch/i386/kernel/process.c */ 1911 /* 1912 * linux/arch/i386/kernel/process.c 1913 * 1914 * Copyright (C) 1995 Linus Torvalds 1915 */ 1916 1917 /* This file handles the architecture-dependent parts of 1918 * process handling.. 
*/ 1919 1920 #define __KERNEL_SYSCALLS__ 1921 #include 1922 1923 #include 1924 #include 1925 #include 1926 #include 1927 #include 1928 #include 1929 #include 1930 #include 1931 #include 1932 #include 1933 #include 1934 #include 1935 #include 1936 #include 1937 #include 1938 #include 1939 #include 1940 #include 1941 #include 1942 #include 1943 #if defined(CONFIG_APM) && defined(CONFIG_APM_POWER_OFF) 1944 #include 1945 #endif 1946 1947 #include 1948 #include 1949 #include 1950 #include 1951 #include 1952 #include 1953 #include 1954 #ifdef CONFIG_MATH_EMULATION 1955 #include 1956 #endif 1957 1958 #include "irq.h" 1959 1960 spinlock_t semaphore_wake_lock = SPIN_LOCK_UNLOCKED; 1961 1962 asmlinkage void ret_from_fork(void) 1963 __asm__("ret_from_fork"); 1964 1965 #ifdef CONFIG_APM 1966 extern int apm_do_idle(void); 1967 extern void apm_do_busy(void); 1968 #endif 1969 1970 static int hlt_counter=0; 1971 1972 #define HARD_IDLE_TIMEOUT (HZ / 3) 1973 1974 void disable_hlt(void) 1975 { 1976 hlt_counter++; 1977 } 1978 1979 void enable_hlt(void) 1980 { 1981 hlt_counter--; 1982 } 1983 1984 #ifndef __SMP__ 1985 1986 static void hard_idle(void) 1987 { 1988 while (!current->need_resched) { 1989 if (boot_cpu_data.hlt_works_ok && !hlt_counter) { 1990 #ifdef CONFIG_APM 1991 /* If the APM BIOS is not enabled, or there 1992 is an error calling the idle routine, we 1993 should hlt if possible. We need to check 1994 need_resched again because an interrupt 1995 may have occurred in apm_do_idle(). */ 1996 start_bh_atomic(); 1997 if (!apm_do_idle() && !current->need_resched) 1998 __asm__("hlt"); 1999 end_bh_atomic(); 2000 #else 2001 __asm__("hlt"); 2002 #endif 2003 } 2004 if (current->need_resched) 2005 break; 2006 schedule(); 2007 } 2008 #ifdef CONFIG_APM 2009 apm_do_busy(); 2010 #endif 2011 } 2012 2013 /* The idle loop on a uniprocessor i386.. 
*/ 2014 static int cpu_idle(void *unused) 2015 { 2016 int work = 1; 2017 unsigned long start_idle = 0; 2018 2019 /* endless idle loop with no priority at all */ 2020 current->priority = 0; 2021 current->counter = -100; 2022 for (;;) { 2023 if (work) 2024 start_idle = jiffies; 2025 2026 if (jiffies - start_idle > HARD_IDLE_TIMEOUT) 2027 hard_idle(); 2028 else { 2029 if (boot_cpu_data.hlt_works_ok && 2030 !hlt_counter && !current->need_resched) 2031 __asm__("hlt"); 2032 } 2033 2034 work = current->need_resched; 2035 schedule(); 2036 check_pgt_cache(); 2037 } 2038 } 2039 2040 #else 2041 2042 /* This is being executed in task 0 'user space'. */ 2043 2044 int cpu_idle(void *unused) 2045 { 2046 /* endless idle loop with no priority at all */ 2047 current->priority = 0; 2048 current->counter = -100; 2049 while(1) { 2050 if (current_cpu_data.hlt_works_ok && !hlt_counter && 2051 !current->need_resched) 2052 __asm__("hlt"); 2053 /* although we are an idle CPU, we do not want to get 2054 * into the scheduler unnecessarily. */ 2055 if (current->need_resched) { 2056 schedule(); 2057 check_pgt_cache(); 2058 } 2059 } 2060 } 2061 2062 #endif 2063 2064 asmlinkage int sys_idle(void) 2065 { 2066 if (current->pid != 0) 2067 return -EPERM; 2068 cpu_idle(NULL); 2069 return 0; 2070 } 2071 2072 /* This routine reboots the machine by asking the 2073 * keyboard controller to pulse the reset-line low. We 2074 * try that for a while, and if it doesn't work, we do 2075 * some other stupid things. 
*/ 2076 2077 static long no_idt[2] = {0, 0}; 2078 static int reboot_mode = 0; 2079 static int reboot_thru_bios = 0; 2080 2081 __initfunc(void reboot_setup(char *str, int *ints)) 2082 { 2083 while(1) { 2084 switch (*str) { 2085 case 'w': /* "warm" reboot (no memory testing etc) */ 2086 reboot_mode = 0x1234; 2087 break; 2088 case 'c': /* "cold" reboot (w/ memory testing etc) */ 2089 reboot_mode = 0x0; 2090 break; 2091 case 'b': /* "bios" reboot by jumping thru the BIOS*/ 2092 reboot_thru_bios = 1; 2093 break; 2094 case 'h': 2095 /* "hard" reboot by toggling RESET and/or crashing 2096 * the CPU */ 2097 reboot_thru_bios = 0; 2098 break; 2099 } 2100 if((str = strchr(str,',')) != NULL) 2101 str++; 2102 else 2103 break; 2104 } 2105 } 2106 2107 /* The following code and data reboots the machine by 2108 * switching to real mode and jumping to the BIOS reset 2109 * entry point, as if the CPU has really been reset. The 2110 * previous version asked the keyboard controller to 2111 * pulse the CPU reset line, which is more thorough, but 2112 * doesn't work with at least one type of 486 2113 * motherboard. It is easy to stop this code working; 2114 * hence the copious comments. */ 2115 static unsigned long long 2116 real_mode_gdt_entries [3] = 2117 { 2118 0x0000000000000000ULL, /* Null descriptor */ 2119 /* 16-bit real-mode 64k code at 0x00000000 */ 2120 0x00009a000000ffffULL, 2121 /* 16-bit real-mode 64k data at 0x00000100 */ 2122 0x000092000100ffffULL 2123 }; 2124 2125 static struct 2126 { 2127 unsigned short size __attribute__ ((packed)); 2128 unsigned long long * base __attribute__ ((packed)); 2129 } 2130 real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, 2131 real_mode_gdt_entries }, 2132 real_mode_idt = { 0x3ff, 0 }; 2133 2134 /* This is 16-bit protected mode code to disable paging 2135 and the cache, switch to real mode and jump to the 2136 BIOS reset code. 
2137 2138 The instruction that switches to real mode by writing 2139 to CR0 must be followed immediately by a far jump 2140 instruction, which set CS to a valid value for real 2141 mode, and flushes the prefetch queue to avoid running 2142 instructions that have already been decoded in 2143 protected mode. 2144 2145 Clears all the flags except ET, especially PG 2146 (paging), PE (protected-mode enable) and TS (task 2147 switch for coprocessor state save). Flushes the TLB 2148 after paging has been disabled. Sets CD and NW, to 2149 disable the cache on a 486, and invalidates the cache. 2150 This is more like the state of a 486 after reset. I 2151 don't know if something else should be done for other 2152 chips. 2153 2154 More could be done here to set up the registers as if 2155 a CPU reset had occurred; hopefully real BIOSs don't 2156 assume much. */ 2157 2158 static unsigned char real_mode_switch [] = 2159 { 2160 0x66, 0x0f, 0x20, 0xc0, /*movl %cr0,%eax */ 2161 0x66, 0x83, 0xe0, 0x11, /*andl $0x00000011,%eax*/ 2162 /*orl $0x60000000,%eax*/ 2163 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, 2164 0x66, 0x0f, 0x22, 0xc0, /*movl %eax,%cr0 */ 2165 0x66, 0x0f, 0x22, 0xd8, /*movl %eax,%cr3 */ 2166 0x66, 0x0f, 0x20, 0xc3, /*movl %cr0,%ebx */ 2167 /*andl $0x60000000,%ebx*/ 2168 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, 2169 0x74, 0x02, /*jz f */ 2170 0x0f, 0x08, /*invd */ 2171 0x24, 0x10, /*f: andb $0x10,al*/ 2172 0x66, 0x0f, 0x22, 0xc0, /*movl %eax,%cr0*/ 2173 0xea, 0x00, 0x00, 0xff, 0xff /*ljmp $0xffff,$0x0000*/ 2174 }; 2175 2176 static inline void kb_wait(void) 2177 { 2178 int i; 2179 2180 for (i=0; i<0x10000; i++) 2181 if ((inb_p(0x64) & 0x02) == 0) 2182 break; 2183 } 2184 2185 void machine_restart(char * __unused) 2186 { 2187 #if __SMP__ 2188 /* turn off the IO-APIC, so we can do a clean reboot */ 2189 init_pic_mode(); 2190 #endif 2191 2192 if(!reboot_thru_bios) { 2193 /* rebooting needs to touch the page at abs addr 0 */ 2194 *((unsigned short *)__va(0x472)) = reboot_mode; 
2195 for (;;) { 2196 int i; 2197 for (i=0; i<100; i++) { 2198 kb_wait(); 2199 udelay(50); 2200 outb(0xfe,0x64); /* pulse reset low */ 2201 udelay(50); 2202 } 2203 /* That didn't work - force a triple fault.. */ 2204 __asm__ __volatile__("lidt %0": :"m" (no_idt)); 2205 __asm__ __volatile__("int3"); 2206 } 2207 } 2208 2209 cli(); 2210 2211 /* Write zero to CMOS register number 0x0f, which the 2212 BIOS POST routine will recognize as telling it to do 2213 a proper reboot. (Well that's what this book in 2214 front of me says -- it may only apply to the Phoenix 2215 BIOS though, it's not clear). At the same time, 2216 disable NMIs by setting the top bit in the CMOS 2217 address register, as we're about to do peculiar 2218 things to the CPU. I'm not sure if `outb_p' is 2219 needed instead of just `outb'. Use it to be on the 2220 safe side. */ 2221 2222 outb_p (0x8f, 0x70); 2223 outb_p (0x00, 0x71); 2224 2225 /* Remap the kernel at virtual address zero, as well as 2226 offset zero from the kernel segment. This assumes 2227 the kernel segment starts at virtual address 2228 PAGE_OFFSET. */ 2229 2230 memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 2231 sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS); 2232 2233 /* Make sure the first page is mapped to the start of 2234 physical memory. It is normally not mapped, to trap 2235 kernel NULL pointer dereferences. */ 2236 2237 pg0[0] = _PAGE_RW | _PAGE_PRESENT; 2238 2239 /* Use `swapper_pg_dir' as our page directory. We 2240 * bother with `SET_PAGE_DIR' because although might be 2241 * rebooting, but if we change the way we set root page 2242 * dir in the future, then we wont break a seldom used 2243 * feature ;) */ 2244 2245 SET_PAGE_DIR(current,swapper_pg_dir); 2246 2247 /* Write 0x1234 to absolute memory location 0x472. The 2248 BIOS reads this on booting to tell it to "Bypass 2249 memory test (also warm boot)". 
This seems like a 2250 fairly standard thing that gets set by REBOOT.COM 2251 programs, and the previous reset routine did this 2252 too. */ 2253 2254 *((unsigned short *)0x472) = reboot_mode; 2255 2256 /* For the switch to real mode, copy some code to low 2257 memory. It has to be in the first 64k because it is 2258 running in 16-bit mode, and it has to have the same 2259 physical and virtual address, because it turns off 2260 paging. Copy it near the end of the first page, out 2261 of the way of BIOS variables. */ 2262 2263 memcpy ((void *) (0x1000 - sizeof (real_mode_switch)), 2264 real_mode_switch, sizeof (real_mode_switch)); 2265 2266 /* Set up the IDT for real mode. */ 2267 2268 __asm__ __volatile__ 2269 ("lidt %0" : : "m" (real_mode_idt)); 2270 2271 /* Set up a GDT from which we can load segment 2272 descriptors for real mode. The GDT is not used in 2273 real mode; it is just needed here to prepare the 2274 descriptors. */ 2275 2276 __asm__ __volatile__ 2277 ("lgdt %0" : : "m" (real_mode_gdt)); 2278 2279 /* Load the data segment registers, and thus the 2280 descriptors ready for real mode. The base address 2281 of each segment is 0x100, 16 times the selector 2282 value being loaded here. This is so that the 2283 segment registers don't have to be reloaded after 2284 switching to real mode: the values are consistent 2285 for real mode operation already. */ 2286 2287 __asm__ __volatile__ ("movl $0x0010,%%eax\n" 2288 "\tmovl %%ax,%%ds\n" 2289 "\tmovl %%ax,%%es\n" 2290 "\tmovl %%ax,%%fs\n" 2291 "\tmovl %%ax,%%gs\n" 2292 "\tmovl %%ax,%%ss" : : : "eax"); 2293 2294 /* Jump to the 16-bit code that we copied earlier. It 2295 disables paging and the cache, switches to real 2296 mode, and jumps to the BIOS reset entry point. 
*/ 2297 2298 __asm__ __volatile__ ("ljmp $0x0008,%0" 2299 : 2300 : "i" ((void *) (0x1000 - 2301 sizeof (real_mode_switch)))); 2302 } 2303 2304 void machine_halt(void) 2305 {} 2306 2307 void machine_power_off(void) 2308 { 2309 #if defined(CONFIG_APM) && defined(CONFIG_APM_POWER_OFF) 2310 apm_power_off(); 2311 #endif 2312 } 2313 2314 2315 void show_regs(struct pt_regs * regs) 2316 { 2317 long cr0 = 0L, cr2 = 0L, cr3 = 0L; 2318 2319 printk("\n"); 2320 printk("EIP: %04x:[<%08lx>]", 2321 0xffff & regs->xcs,regs->eip); 2322 if (regs->xcs & 3) 2323 printk(" ESP: %04x:%08lx", 2324 0xffff & regs->xss,regs->esp); 2325 printk(" EFLAGS: %08lx\n",regs->eflags); 2326 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 2327 regs->eax,regs->ebx,regs->ecx,regs->edx); 2328 printk("ESI: %08lx EDI: %08lx EBP: %08lx", 2329 regs->esi, regs->edi, regs->ebp); 2330 printk(" DS: %04x ES: %04x\n", 2331 0xffff & regs->xds,0xffff & regs->xes); 2332 __asm__("movl %%cr0, %0": "=r" (cr0)); 2333 __asm__("movl %%cr2, %0": "=r" (cr2)); 2334 __asm__("movl %%cr3, %0": "=r" (cr3)); 2335 printk("CR0: %08lx CR2: %08lx CR3: %08lx\n", 2336 cr0, cr2, cr3); 2337 } 2338 2339 /* Allocation and freeing of basic task resources. 2340 * 2341 * NOTE! The task struct and the stack go together 2342 * 2343 * The task structure is a two-page thing, and as such 2344 * not reliable to allocate using the basic page alloc 2345 * functions. We have a small cache of structures for 2346 * when the allocations fail.. 2347 * 2348 * This extra buffer essentially acts to make for less 2349 * "jitter" in the allocations.. 2350 * 2351 * On SMP we don't do this right now because: 2352 * - we aren't holding any locks when called, and we 2353 * might as well just depend on the generic memory 2354 * management to do proper locking for us instead of 2355 * complicating it here. 2356 * - if you use SMP you have a beefy enough machine that 2357 * this shouldn't matter.. 
*/ 2358 #ifndef __SMP__ 2359 #define EXTRA_TASK_STRUCT 16 2360 static struct task_struct * 2361 task_struct_stack[EXTRA_TASK_STRUCT]; 2362 static int task_struct_stack_ptr = -1; 2363 #endif 2364 2365 struct task_struct * alloc_task_struct(void) 2366 { 2367 #ifndef EXTRA_TASK_STRUCT 2368 return (struct task_struct *) 2369 __get_free_pages(GFP_KERNEL,1); 2370 #else 2371 int index; 2372 struct task_struct *ret; 2373 2374 index = task_struct_stack_ptr; 2375 if (index >= EXTRA_TASK_STRUCT/2) 2376 goto use_cache; 2377 ret = (struct task_struct *) 2378 __get_free_pages(GFP_KERNEL,1); 2379 if (!ret) { 2380 index = task_struct_stack_ptr; 2381 if (index >= 0) { 2382 use_cache: 2383 ret = task_struct_stack[index]; 2384 task_struct_stack_ptr = index-1; 2385 } 2386 } 2387 return ret; 2388 #endif 2389 } 2390 2391 void free_task_struct(struct task_struct *p) 2392 { 2393 #ifdef EXTRA_TASK_STRUCT 2394 int index = task_struct_stack_ptr+1; 2395 2396 if (index < EXTRA_TASK_STRUCT) { 2397 task_struct_stack[index] = p; 2398 task_struct_stack_ptr = index; 2399 } else 2400 #endif 2401 free_pages((unsigned long) p, 1); 2402 } 2403 2404 void release_segments(struct mm_struct *mm) 2405 { 2406 if (mm->segments) { 2407 void * ldt = mm->segments; 2408 mm->segments = NULL; 2409 vfree(ldt); 2410 } 2411 } 2412 2413 void forget_segments(void) 2414 { 2415 /* forget local segments */ 2416 __asm__ __volatile__("movl %w0,%%fs ; movl %w0,%%gs" 2417 : /* no outputs */ 2418 : "r" (0)); 2419 2420 /* Get the LDT entry from init_task. */ 2421 current->tss.ldt = _LDT(0); 2422 load_ldt(0); 2423 } 2424 2425 /* Create a kernel thread */ 2426 int kernel_thread(int (*fn)(void *), void * arg, 2427 unsigned long flags) 2428 { 2429 long retval, d0; 2430 2431 __asm__ __volatile__( 2432 "movl %%esp,%%esi\n\t" 2433 "int $0x80\n\t" /* Linux/i386 system call */ 2434 "cmpl %%esp,%%esi\n\t" /* child or parent? */ 2435 "je 1f\n\t" /* parent - jump */ 2436 /* Load the argument into eax, and push it. 
That 2437 * way, it does not matter whether the called 2438 * function is compiled with -mregparm or not. */ 2439 "movl %4,%%eax\n\t" 2440 "pushl %%eax\n\t" 2441 "call *%5\n\t" /* call fn */ 2442 "movl %3,%0\n\t" /* exit */ 2443 "int $0x80\n" 2444 "1:\t" 2445 :"=&a" (retval), "=&S" (d0) 2446 :"0" (__NR_clone), "i" (__NR_exit), 2447 "r" (arg), "r" (fn), 2448 "b" (flags | CLONE_VM) 2449 : "memory"); 2450 return retval; 2451 } 2452 2453 /* Free current thread data structures etc.. */ 2454 void exit_thread(void) 2455 { 2456 /* nothing to do ... */ 2457 } 2458 2459 void flush_thread(void) 2460 { 2461 int i; 2462 struct task_struct *tsk = current; 2463 2464 for (i=0 ; i<8 ; i++) 2465 tsk->tss.debugreg[i] = 0; 2466 2467 /* Forget coprocessor state.. */ 2468 clear_fpu(tsk); 2469 tsk->used_math = 0; 2470 } 2471 2472 void release_thread(struct task_struct *dead_task) 2473 { 2474 } 2475 2476 /* If new_mm is NULL, we're being called to set up the 2477 * LDT descriptor for a clone task. Each clone must have 2478 * a separate entry in the GDT. */ 2479 void copy_segments(int nr, struct task_struct *p, 2480 struct mm_struct *new_mm) 2481 { 2482 struct mm_struct * old_mm = current->mm; 2483 void * old_ldt = old_mm->segments, * ldt = old_ldt; 2484 2485 /* default LDT - use the one from init_task */ 2486 p->tss.ldt = _LDT(0); 2487 if (old_ldt) { 2488 if (new_mm) { 2489 ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); 2490 new_mm->segments = ldt; 2491 if (!ldt) { 2492 printk(KERN_WARNING "ldt allocation failed\n"); 2493 return; 2494 } 2495 memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE); 2496 } 2497 p->tss.ldt = _LDT(nr); 2498 set_ldt_desc(nr, ldt, LDT_ENTRIES); 2499 return; 2500 } 2501 } 2502 2503 /* Save a segment. 
*/ 2504 #define savesegment(seg,value) \ 2505 asm volatile("movl %%" #seg ",%0":"=m" \ 2506 (*(int *)&(value))) 2507 2508 int copy_thread(int nr, unsigned long clone_flags, 2509 unsigned long esp,struct task_struct * p, 2510 struct pt_regs * regs) 2511 { 2512 struct pt_regs * childregs; 2513 2514 childregs = ((struct pt_regs *) 2515 (2*PAGE_SIZE + (unsigned long) p)) - 1; 2516 *childregs = *regs; 2517 childregs->eax = 0; 2518 childregs->esp = esp; 2519 2520 p->tss.esp = (unsigned long) childregs; 2521 p->tss.esp0 = (unsigned long) (childregs+1); 2522 p->tss.ss0 = __KERNEL_DS; 2523 2524 p->tss.tr = _TSS(nr); 2525 set_tss_desc(nr,&(p->tss)); 2526 p->tss.eip = (unsigned long) ret_from_fork; 2527 2528 savesegment(fs,p->tss.fs); 2529 savesegment(gs,p->tss.gs); 2530 2531 /* a bitmap offset pointing outside of the TSS limit 2532 * causes a nicely controllable SIGSEGV. The first 2533 * sys_ioperm() call sets up the bitmap properly. */ 2534 p->tss.bitmap = sizeof(struct thread_struct); 2535 2536 unlazy_fpu(current); 2537 p->tss.i387 = current->tss.i387; 2538 2539 return 0; 2540 } 2541 2542 /* fill in the FPU structure for a core dump. */ 2543 int dump_fpu(struct pt_regs * regs, 2544 struct user_i387_struct * fpu) 2545 { 2546 int fpvalid; 2547 struct task_struct *tsk = current; 2548 2549 fpvalid = tsk->used_math; 2550 if (fpvalid) { 2551 unlazy_fpu(tsk); 2552 memcpy(fpu,&tsk->tss.i387.hard,sizeof(*fpu)); 2553 } 2554 2555 return fpvalid; 2556 } 2557 2558 /* fill in the user structure for a core dump.. */ 2559 void dump_thread(struct pt_regs * regs, 2560 struct user * dump) 2561 { 2562 int i; 2563 2564 /* changed the size calculations - should hopefully work 2565 better. 
lbt */ 2566 dump->magic = CMAGIC; 2567 dump->start_code = 0; 2568 dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); 2569 dump->u_tsize = 2570 ((unsigned long) current->mm->end_code) 2571 >> PAGE_SHIFT; 2572 dump->u_dsize = 2573 ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) 2574 >> PAGE_SHIFT; 2575 dump->u_dsize -= dump->u_tsize; 2576 dump->u_ssize = 0; 2577 for (i = 0; i < 8; i++) 2578 dump->u_debugreg[i] = current->tss.debugreg[i]; 2579 2580 if (dump->start_stack < TASK_SIZE) 2581 dump->u_ssize = 2582 ((unsigned long) (TASK_SIZE - dump->start_stack)) 2583 >> PAGE_SHIFT; 2584 2585 dump->regs.ebx = regs->ebx; 2586 dump->regs.ecx = regs->ecx; 2587 dump->regs.edx = regs->edx; 2588 dump->regs.esi = regs->esi; 2589 dump->regs.edi = regs->edi; 2590 dump->regs.ebp = regs->ebp; 2591 dump->regs.eax = regs->eax; 2592 dump->regs.ds = regs->xds; 2593 dump->regs.es = regs->xes; 2594 savesegment(fs,dump->regs.fs); 2595 savesegment(gs,dump->regs.gs); 2596 dump->regs.orig_eax = regs->orig_eax; 2597 dump->regs.eip = regs->eip; 2598 dump->regs.cs = regs->xcs; 2599 dump->regs.eflags = regs->eflags; 2600 dump->regs.esp = regs->esp; 2601 dump->regs.ss = regs->xss; 2602 2603 dump->u_fpvalid = dump_fpu (regs, &dump->i387); 2604 } 2605 2606 /* This special macro can be used to load a debugging 2607 * register */ 2608 #define loaddebug(tsk,register) \ 2609 __asm__("movl %0,%%db" #register \ 2610 : /* no output */ \ 2611 :"r" (tsk->tss.debugreg[register])) 2612 2613 2614 /* switch_to(x,yn) should switch tasks from x to y. 2615 * 2616 * We fsave/fwait so that an exception goes off at the 2617 * right time (as a call from the fsave or fwait in 2618 * effect) rather than to the wrong process. Lazy FP 2619 * saving no longer makes any sense with modern CPU's, 2620 * and this simplifies a lot of things (SMP and UP become 2621 * the same). 2622 * 2623 * NOTE! We used to use the x86 hardware context 2624 * switching. 
The reason for not using it any more 2625 * becomes apparent when you try to recover gracefully 2626 * from saved state that is no longer valid (stale 2627 * segment register values in particular). With the 2628 * hardware task-switch, there is no way to fix up bad 2629 * state in a reasonable manner. 2630 * 2631 * The fact that Intel documents the hardware 2632 * task-switching to be slow is a fairly red herring - 2633 * this code is not noticeably faster. However, there 2634 * _is_ some room for improvement here, so the 2635 * performance issues may eventually be a valid point. 2636 * More important, however, is the fact that this allows 2637 * us much more flexibility. */ 2638 void __switch_to(struct task_struct *prev, 2639 struct task_struct *next) 2640 { 2641 /* Save FPU and set TS if it wasn't set before.. */ 2642 unlazy_fpu(prev); 2643 2644 /* Reload TR, LDT and the page table pointers.. 2645 * 2646 * We need TR for the IO permission bitmask (and the 2647 * vm86 bitmasks in case we ever use enhanced v86 mode 2648 * properly). 2649 * 2650 * We may want to get rid of the TR register some day, 2651 * and copy the bitmaps around by hand. Oh, well. In 2652 * the meantime we have to clear the busy bit in the 2653 * TSS entry, ugh. */ 2654 gdt_table[next->tss.tr >> 3].b &= 0xfffffdff; 2655 asm volatile("ltr %0": :"g" 2656 (*(unsigned short *)&next->tss.tr)); 2657 2658 /* Save away %fs and %gs. No need to save %es and %ds, 2659 * as those are always kernel segments while inside the 2660 * kernel. 
*/ 2661 asm volatile("movl %%fs,%0":"=m" 2662 (*(int *)&prev->tss.fs)); 2663 asm volatile("movl %%gs,%0":"=m" 2664 (*(int *)&prev->tss.gs)); 2665 2666 /* Re-load LDT if necessary */ 2667 if (next->mm->segments != prev->mm->segments) 2668 asm volatile("lldt %0": :"g" 2669 (*(unsigned short *)&next->tss.ldt)); 2670 2671 /* Re-load page tables */ 2672 { 2673 unsigned long new_cr3 = next->tss.cr3; 2674 if (new_cr3 != prev->tss.cr3) 2675 asm volatile("movl %0,%%cr3": :"r" (new_cr3)); 2676 } 2677 2678 /* Restore %fs and %gs. */ 2679 loadsegment(fs,next->tss.fs); 2680 loadsegment(gs,next->tss.gs); 2681 2682 /* Now maybe reload the debug registers */ 2683 if (next->tss.debugreg[7]){ 2684 loaddebug(next,0); 2685 loaddebug(next,1); 2686 loaddebug(next,2); 2687 loaddebug(next,3); 2688 loaddebug(next,6); 2689 loaddebug(next,7); 2690 } 2691 } 2692 2693 asmlinkage int sys_fork(struct pt_regs regs) 2694 { 2695 return do_fork(SIGCHLD, regs.esp, ®s); 2696 } 2697 2698 asmlinkage int sys_clone(struct pt_regs regs) 2699 { 2700 unsigned long clone_flags; 2701 unsigned long newsp; 2702 2703 clone_flags = regs.ebx; 2704 newsp = regs.ecx; 2705 if (!newsp) 2706 newsp = regs.esp; 2707 return do_fork(clone_flags, newsp, ®s); 2708 } 2709 2710 /* This is trivial, and on the face of it looks like it 2711 * could equally well be done in user mode. 2712 * 2713 * Not so, for quite unobvious reasons - register 2714 * pressure. In user mode vfork() cannot have a stack 2715 * frame, and if done by calling the "clone()" system 2716 * call directly, you do not have enough call-clobbered 2717 * registers to hold all the information you need. */ 2718 asmlinkage int sys_vfork(struct pt_regs regs) 2719 { 2720 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 2721 regs.esp, ®s); 2722 } 2723 2724 /* sys_execve() executes a new program. 
*/ 2725 asmlinkage int sys_execve(struct pt_regs regs) 2726 { 2727 int error; 2728 char * filename; 2729 2730 lock_kernel(); 2731 filename = getname((char *) regs.ebx); 2732 error = PTR_ERR(filename); 2733 if (IS_ERR(filename)) 2734 goto out; 2735 error = do_execve(filename, (char **) regs.ecx, 2736 (char **) regs.edx, ®s); 2737 if (error == 0) 2738 current->flags &= ~PF_DTRACE; 2739 putname(filename); 2740 out: 2741 unlock_kernel(); 2742 return error; 2743 } /* FILE: arch/i386/kernel/signal.c */ 2744 /* 2745 * linux/arch/i386/kernel/signal.c 2746 * 2747 * Copyright (C) 1991, 1992 Linus Torvalds 2748 * 1997-11-28 Modified for POSIX.1b signals by Richard 2749 * Henderson */ 2750 2751 #include 2752 2753 #include 2754 #include 2755 #include 2756 #include 2757 #include 2758 #include 2759 #include 2760 #include 2761 #include 2762 #include 2763 #include 2764 #include 2765 #include 2766 2767 #define DEBUG_SIG 0 2768 2769 #define _BLOCKABLE (~(sigmask(SIGKILL)|sigmask(SIGSTOP))) 2770 2771 asmlinkage int sys_wait4(pid_t pid, 2772 unsigned long *stat_addr, 2773 int options, unsigned long *ru); 2774 asmlinkage int FASTCALL(do_signal(struct pt_regs *regs, 2775 sigset_t *oldset)); 2776 2777 /* Atomically swap in the new signal mask, and wait for a 2778 * signal. 
*/ 2779 asmlinkage int 2780 sys_sigsuspend(int history0, int history1, 2781 old_sigset_t mask) 2782 { 2783 struct pt_regs * regs = (struct pt_regs *) &history0; 2784 sigset_t saveset; 2785 2786 mask &= _BLOCKABLE; 2787 spin_lock_irq(¤t->sigmask_lock); 2788 saveset = current->blocked; 2789 siginitset(¤t->blocked, mask); 2790 recalc_sigpending(current); 2791 spin_unlock_irq(¤t->sigmask_lock); 2792 2793 regs->eax = -EINTR; 2794 while (1) { 2795 current->state = TASK_INTERRUPTIBLE; 2796 schedule(); 2797 if (do_signal(regs, &saveset)) 2798 return -EINTR; 2799 } 2800 } 2801 2802 asmlinkage int 2803 sys_rt_sigsuspend(sigset_t *unewset, size_t sigsetsize) 2804 { 2805 struct pt_regs * regs = (struct pt_regs *) &unewset; 2806 sigset_t saveset, newset; 2807 2808 /* XXX: Don't preclude handling different sized 2809 * sigset_t's. */ 2810 if (sigsetsize != sizeof(sigset_t)) 2811 return -EINVAL; 2812 2813 if (copy_from_user(&newset, unewset, sizeof(newset))) 2814 return -EFAULT; 2815 sigdelsetmask(&newset, ~_BLOCKABLE); 2816 2817 spin_lock_irq(¤t->sigmask_lock); 2818 saveset = current->blocked; 2819 current->blocked = newset; 2820 recalc_sigpending(current); 2821 spin_unlock_irq(¤t->sigmask_lock); 2822 2823 regs->eax = -EINTR; 2824 while (1) { 2825 current->state = TASK_INTERRUPTIBLE; 2826 schedule(); 2827 if (do_signal(regs, &saveset)) 2828 return -EINTR; 2829 } 2830 } 2831 2832 asmlinkage int 2833 sys_sigaction(int sig, const struct old_sigaction *act, 2834 struct old_sigaction *oact) 2835 { 2836 struct k_sigaction new_ka, old_ka; 2837 int ret; 2838 2839 if (act) { 2840 old_sigset_t mask; 2841 if (verify_area(VERIFY_READ, act, sizeof(*act)) || 2842 __get_user(new_ka.sa.sa_handler, 2843 &act->sa_handler) || 2844 __get_user(new_ka.sa.sa_restorer, 2845 &act->sa_restorer)) 2846 return -EFAULT; 2847 __get_user(new_ka.sa.sa_flags, &act->sa_flags); 2848 __get_user(mask, &act->sa_mask); 2849 siginitset(&new_ka.sa.sa_mask, mask); 2850 } 2851 2852 ret = do_sigaction(sig, act ? 
&new_ka : NULL, 2853 oact ? &old_ka : NULL); 2854 2855 if (!ret && oact) { 2856 if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) || 2857 __put_user(old_ka.sa.sa_handler, 2858 &oact->sa_handler) || 2859 __put_user(old_ka.sa.sa_restorer, 2860 &oact->sa_restorer)) 2861 return -EFAULT; 2862 __put_user(old_ka.sa.sa_flags, &oact->sa_flags); 2863 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); 2864 } 2865 2866 return ret; 2867 } 2868 2869 asmlinkage int 2870 sys_sigaltstack(const stack_t *uss, stack_t *uoss) 2871 { 2872 struct pt_regs *regs = (struct pt_regs *) &uss; 2873 return do_sigaltstack(uss, uoss, regs->esp); 2874 } 2875 2876 2877 /* Do a signal return; undo the signal stack. */ 2878 2879 struct sigframe 2880 { 2881 char *pretcode; 2882 int sig; 2883 struct sigcontext sc; 2884 struct _fpstate fpstate; 2885 unsigned long extramask[_NSIG_WORDS-1]; 2886 char retcode[8]; 2887 }; 2888 2889 struct rt_sigframe 2890 { 2891 char *pretcode; 2892 int sig; 2893 struct siginfo *pinfo; 2894 void *puc; 2895 struct siginfo info; 2896 struct ucontext uc; 2897 struct _fpstate fpstate; 2898 char retcode[8]; 2899 }; 2900 2901 2902 static inline int restore_i387_hard(struct _fpstate *buf) 2903 { 2904 struct task_struct *tsk = current; 2905 clear_fpu(tsk); 2906 return __copy_from_user(&tsk->tss.i387.hard, buf, 2907 sizeof(*buf)); 2908 } 2909 2910 static inline int restore_i387(struct _fpstate *buf) 2911 { 2912 int err; 2913 #ifndef CONFIG_MATH_EMULATION 2914 err = restore_i387_hard(buf); 2915 #else 2916 if (boot_cpu_data.hard_math) 2917 err = restore_i387_hard(buf); 2918 else 2919 err = restore_i387_soft(¤t->tss.i387.soft,buf); 2920 #endif 2921 current->used_math = 1; 2922 return err; 2923 } 2924 2925 static int 2926 restore_sigcontext(struct pt_regs *regs, 2927 struct sigcontext *sc, int *peax) 2928 { 2929 unsigned int err = 0; 2930 2931 #define COPY(x) err |= __get_user(regs->x, &sc->x) 2932 2933 #define COPY_SEG(seg) \ 2934 { unsigned short tmp; \ 2935 err |= __get_user(tmp, 
&sc->seg); \ 2936 regs->x##seg = tmp; } 2937 2938 #define COPY_SEG_STRICT(seg) \ 2939 { unsigned short tmp; \ 2940 err |= __get_user(tmp, &sc->seg); \ 2941 regs->x##seg = tmp|3; } 2942 2943 #define GET_SEG(seg) \ 2944 { unsigned short tmp; \ 2945 err |= __get_user(tmp, &sc->seg); \ 2946 loadsegment(seg,tmp); } 2947 2948 GET_SEG(gs); 2949 GET_SEG(fs); 2950 COPY_SEG(es); 2951 COPY_SEG(ds); 2952 COPY(edi); 2953 COPY(esi); 2954 COPY(ebp); 2955 COPY(esp); 2956 COPY(ebx); 2957 COPY(edx); 2958 COPY(ecx); 2959 COPY(eip); 2960 COPY_SEG_STRICT(cs); 2961 COPY_SEG_STRICT(ss); 2962 2963 { 2964 unsigned int tmpflags; 2965 err |= __get_user(tmpflags, &sc->eflags); 2966 regs->eflags = (regs->eflags & ~0x40DD5) | 2967 (tmpflags & 0x40DD5); 2968 regs->orig_eax = -1; /* disable syscall checks */ 2969 } 2970 2971 { 2972 struct _fpstate * buf; 2973 err |= __get_user(buf, &sc->fpstate); 2974 if (buf) { 2975 if (verify_area(VERIFY_READ, buf, sizeof(*buf))) 2976 goto badframe; 2977 err |= restore_i387(buf); 2978 } 2979 } 2980 2981 err |= __get_user(*peax, &sc->eax); 2982 return err; 2983 2984 badframe: 2985 return 1; 2986 } 2987 2988 asmlinkage int sys_sigreturn(unsigned long __unused) 2989 { 2990 struct pt_regs *regs = (struct pt_regs *) &__unused; 2991 struct sigframe *frame = 2992 (struct sigframe *)(regs->esp - 8); 2993 sigset_t set; 2994 int eax; 2995 2996 if (verify_area(VERIFY_READ, frame, sizeof(*frame))) 2997 goto badframe; 2998 if (__get_user(set.sig[0], &frame->sc.oldmask) 2999 || (_NSIG_WORDS > 1 3000 && __copy_from_user(&set.sig[1], &frame->extramask, 3001 sizeof(frame->extramask)))) 3002 goto badframe; 3003 3004 sigdelsetmask(&set, ~_BLOCKABLE); 3005 spin_lock_irq(¤t->sigmask_lock); 3006 current->blocked = set; 3007 recalc_sigpending(current); 3008 spin_unlock_irq(¤t->sigmask_lock); 3009 3010 if (restore_sigcontext(regs, &frame->sc, &eax)) 3011 goto badframe; 3012 return eax; 3013 3014 badframe: 3015 force_sig(SIGSEGV, current); 3016 return 0; 3017 } 3018 3019 asmlinkage int 
sys_rt_sigreturn(unsigned long __unused) 3020 { 3021 struct pt_regs *regs = (struct pt_regs *) &__unused; 3022 struct rt_sigframe *frame = 3023 (struct rt_sigframe *)(regs->esp - 4); 3024 sigset_t set; 3025 stack_t st; 3026 int eax; 3027 3028 if (verify_area(VERIFY_READ, frame, sizeof(*frame))) 3029 goto badframe; 3030 if (__copy_from_user(&set, &frame->uc.uc_sigmask, 3031 sizeof(set))) 3032 goto badframe; 3033 3034 sigdelsetmask(&set, ~_BLOCKABLE); 3035 spin_lock_irq(¤t->sigmask_lock); 3036 current->blocked = set; 3037 recalc_sigpending(current); 3038 spin_unlock_irq(¤t->sigmask_lock); 3039 3040 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, 3041 &eax)) 3042 goto badframe; 3043 3044 if (__copy_from_user(&st, &frame->uc.uc_stack, 3045 sizeof(st))) 3046 goto badframe; 3047 /* It is more difficult to avoid calling this function 3048 * than to call it and ignore errors. */ 3049 do_sigaltstack(&st, NULL, regs->esp); 3050 3051 return eax; 3052 3053 badframe: 3054 force_sig(SIGSEGV, current); 3055 return 0; 3056 } 3057 3058 /* Set up a signal frame. */ 3059 3060 static inline int save_i387_hard(struct _fpstate * buf) 3061 { 3062 struct task_struct *tsk = current; 3063 3064 unlazy_fpu(tsk); 3065 tsk->tss.i387.hard.status = tsk->tss.i387.hard.swd; 3066 if (__copy_to_user(buf, &tsk->tss.i387.hard, 3067 sizeof(*buf))) 3068 return -1; 3069 return 1; 3070 } 3071 3072 static int save_i387(struct _fpstate *buf) 3073 { 3074 if (!current->used_math) 3075 return 0; 3076 3077 /* This will cause a "finit" to be triggered by the 3078 * next attempted FPU operation by the 'current' 3079 * process. */ 3080 current->used_math = 0; 3081 3082 #ifndef CONFIG_MATH_EMULATION 3083 return save_i387_hard(buf); 3084 #else 3085 return boot_cpu_data.hard_math ? 
save_i387_hard(buf) 3086 : save_i387_soft(¤t->tss.i387.soft, buf); 3087 #endif 3088 } 3089 3090 static int 3091 setup_sigcontext(struct sigcontext *sc, 3092 struct _fpstate *fpstate, 3093 struct pt_regs *regs,unsigned long mask) 3094 { 3095 int tmp, err = 0; 3096 3097 tmp = 0; 3098 __asm__("movl %%gs,%w0" : "=r"(tmp): "0"(tmp)); 3099 err |= __put_user(tmp, (unsigned int *)&sc->gs); 3100 __asm__("movl %%fs,%w0" : "=r"(tmp): "0"(tmp)); 3101 err |= __put_user(tmp, (unsigned int *)&sc->fs); 3102 3103 err |= __put_user(regs->xes, (unsigned int *)&sc->es); 3104 err |= __put_user(regs->xds, (unsigned int *)&sc->ds); 3105 err |= __put_user(regs->edi, &sc->edi); 3106 err |= __put_user(regs->esi, &sc->esi); 3107 err |= __put_user(regs->ebp, &sc->ebp); 3108 err |= __put_user(regs->esp, &sc->esp); 3109 err |= __put_user(regs->ebx, &sc->ebx); 3110 err |= __put_user(regs->edx, &sc->edx); 3111 err |= __put_user(regs->ecx, &sc->ecx); 3112 err |= __put_user(regs->eax, &sc->eax); 3113 err |= __put_user(current->tss.trap_no, &sc->trapno); 3114 err |= __put_user(current->tss.error_code, &sc->err); 3115 err |= __put_user(regs->eip, &sc->eip); 3116 err |= __put_user(regs->xcs, (unsigned int *)&sc->cs); 3117 err |= __put_user(regs->eflags, &sc->eflags); 3118 err |= __put_user(regs->esp, &sc->esp_at_signal); 3119 err |= __put_user(regs->xss, (unsigned int *)&sc->ss); 3120 3121 tmp = save_i387(fpstate); 3122 if (tmp < 0) 3123 err = 1; 3124 else 3125 err |= __put_user(tmp ? fpstate : NULL,&sc->fpstate); 3126 3127 /* non-iBCS2 extensions.. */ 3128 err |= __put_user(mask, &sc->oldmask); 3129 err |= __put_user(current->tss.cr2, &sc->cr2); 3130 3131 return err; 3132 } 3133 3134 /* Determine which stack to use.. 
*/ 3135 static inline void * 3136 get_sigframe(struct k_sigaction *ka, 3137 struct pt_regs * regs, size_t frame_size) 3138 { 3139 unsigned long esp; 3140 3141 /* Default to using normal stack */ 3142 esp = regs->esp; 3143 3144 /* This is the X/Open sanctioned signal stack 3145 * switching. */ 3146 if (ka->sa.sa_flags & SA_ONSTACK) { 3147 if (! on_sig_stack(esp)) 3148 esp = current->sas_ss_sp + current->sas_ss_size; 3149 } 3150 3151 /* This is the legacy signal stack switching. */ 3152 else if ((regs->xss & 0xffff) != __USER_DS && 3153 !(ka->sa.sa_flags & SA_RESTORER) && 3154 ka->sa.sa_restorer) { 3155 esp = (unsigned long) ka->sa.sa_restorer; 3156 } 3157 3158 return (void *)((esp - frame_size) & -8ul); 3159 } 3160 3161 static void setup_frame(int sig, struct k_sigaction *ka, 3162 sigset_t *set, struct pt_regs * regs) 3163 { 3164 struct sigframe *frame; 3165 int err = 0; 3166 3167 frame = get_sigframe(ka, regs, sizeof(*frame)); 3168 3169 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 3170 goto give_sigsegv; 3171 3172 err |= __put_user((current->exec_domain 3173 && current->exec_domain->signal_invmap 3174 && sig < 32 3175 ? current->exec_domain->signal_invmap[sig] 3176 : sig), 3177 &frame->sig); 3178 3179 err |= setup_sigcontext(&frame->sc, &frame->fpstate, 3180 regs, set->sig[0]); 3181 3182 if (_NSIG_WORDS > 1) { 3183 err |= __copy_to_user(frame->extramask, &set->sig[1], 3184 sizeof(frame->extramask)); 3185 } 3186 3187 /* Set up to return from userspace. If provided, use a 3188 * stub already in userspace. 
*/ 3189 if (ka->sa.sa_flags & SA_RESTORER) { 3190 err |= __put_user(ka->sa.sa_restorer, 3191 &frame->pretcode); 3192 } else { 3193 err |= __put_user(frame->retcode, &frame->pretcode); 3194 /* This is popl %eax ; movl $,%eax ; int $0x80 */ 3195 err |= __put_user(0xb858, 3196 (short *)(frame->retcode+0)); 3197 err |= __put_user(__NR_sigreturn, 3198 (int *)(frame->retcode+2)); 3199 err |= __put_user(0x80cd, 3200 (short *)(frame->retcode+6)); 3201 } 3202 3203 if (err) 3204 goto give_sigsegv; 3205 3206 /* Set up registers for signal handler */ 3207 regs->esp = (unsigned long) frame; 3208 regs->eip = (unsigned long) ka->sa.sa_handler; 3209 3210 set_fs(USER_DS); 3211 regs->xds = __USER_DS; 3212 regs->xes = __USER_DS; 3213 regs->xss = __USER_DS; 3214 regs->xcs = __USER_CS; 3215 regs->eflags &= ~TF_MASK; 3216 3217 #if DEBUG_SIG 3218 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 3219 current->comm, current->pid, frame, regs->eip, 3220 frame->pretcode); 3221 #endif 3222 3223 return; 3224 3225 give_sigsegv: 3226 if (sig == SIGSEGV) 3227 ka->sa.sa_handler = SIG_DFL; 3228 force_sig(SIGSEGV, current); 3229 } 3230 3231 static void setup_rt_frame(int sig, 3232 struct k_sigaction *ka, siginfo_t *info, 3233 sigset_t *set, struct pt_regs * regs) 3234 { 3235 struct rt_sigframe *frame; 3236 int err = 0; 3237 3238 frame = get_sigframe(ka, regs, sizeof(*frame)); 3239 3240 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 3241 goto give_sigsegv; 3242 3243 err |= __put_user((current->exec_domain 3244 && current->exec_domain->signal_invmap 3245 && sig < 32 3246 ? current->exec_domain->signal_invmap[sig] 3247 : sig), 3248 &frame->sig); 3249 err |= __put_user(&frame->info, &frame->pinfo); 3250 err |= __put_user(&frame->uc, &frame->puc); 3251 err |= __copy_to_user(&frame->info,info,sizeof(*info)); 3252 3253 /* Create the ucontext. 
*/ 3254 err |= __put_user(0, &frame->uc.uc_flags); 3255 err |= __put_user(0, &frame->uc.uc_link); 3256 err |= __put_user(current->sas_ss_sp, 3257 &frame->uc.uc_stack.ss_sp); 3258 err |= __put_user(sas_ss_flags(regs->esp), 3259 &frame->uc.uc_stack.ss_flags); 3260 err |= __put_user(current->sas_ss_size, 3261 &frame->uc.uc_stack.ss_size); 3262 err |= setup_sigcontext(&frame->uc.uc_mcontext, 3263 &frame->fpstate, 3264 regs, set->sig[0]); 3265 err |= __copy_to_user(&frame->uc.uc_sigmask, set, 3266 sizeof(*set)); 3267 3268 /* Set up to return from userspace. If provided, use a 3269 * stub already in userspace. */ 3270 if (ka->sa.sa_flags & SA_RESTORER) { 3271 err |= __put_user(ka->sa.sa_restorer, 3272 &frame->pretcode); 3273 } else { 3274 err |= __put_user(frame->retcode, &frame->pretcode); 3275 /* This is movl $,%eax ; int $0x80 */ 3276 err |= __put_user(0xb8, (char *)(frame->retcode+0)); 3277 err |= __put_user(__NR_rt_sigreturn, 3278 (int *)(frame->retcode+1)); 3279 err |= __put_user(0x80cd, 3280 (short *)(frame->retcode+5)); 3281 } 3282 3283 if (err) 3284 goto give_sigsegv; 3285 3286 /* Set up registers for signal handler */ 3287 regs->esp = (unsigned long) frame; 3288 regs->eip = (unsigned long) ka->sa.sa_handler; 3289 3290 set_fs(USER_DS); 3291 regs->xds = __USER_DS; 3292 regs->xes = __USER_DS; 3293 regs->xss = __USER_DS; 3294 regs->xcs = __USER_CS; 3295 regs->eflags &= ~TF_MASK; 3296 3297 #if DEBUG_SIG 3298 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 3299 current->comm, current->pid, frame, regs->eip, 3300 frame->pretcode); 3301 #endif 3302 3303 return; 3304 3305 give_sigsegv: 3306 if (sig == SIGSEGV) 3307 ka->sa.sa_handler = SIG_DFL; 3308 force_sig(SIGSEGV, current); 3309 } 3310 3311 /* OK, we're invoking a handler */ 3312 3313 static void 3314 handle_signal(unsigned long sig, struct k_sigaction *ka, 3315 siginfo_t *info, sigset_t *oldset, struct pt_regs * regs) 3316 { 3317 /* Are we from a system call? 
*/ 3318 if (regs->orig_eax >= 0) { 3319 /* If so, check system call restarting.. */ 3320 switch (regs->eax) { 3321 case -ERESTARTNOHAND: 3322 regs->eax = -EINTR; 3323 break; 3324 3325 case -ERESTARTSYS: 3326 if (!(ka->sa.sa_flags & SA_RESTART)) { 3327 regs->eax = -EINTR; 3328 break; 3329 } 3330 /* fallthrough */ 3331 case -ERESTARTNOINTR: 3332 regs->eax = regs->orig_eax; 3333 regs->eip -= 2; 3334 } 3335 } 3336 3337 /* Set up the stack frame */ 3338 if (ka->sa.sa_flags & SA_SIGINFO) 3339 setup_rt_frame(sig, ka, info, oldset, regs); 3340 else 3341 setup_frame(sig, ka, oldset, regs); 3342 3343 if (ka->sa.sa_flags & SA_ONESHOT) 3344 ka->sa.sa_handler = SIG_DFL; 3345 3346 if (!(ka->sa.sa_flags & SA_NODEFER)) { 3347 spin_lock_irq(¤t->sigmask_lock); 3348 sigorsets(¤t->blocked,¤t->blocked, 3349 &ka->sa.sa_mask); 3350 sigaddset(¤t->blocked,sig); 3351 recalc_sigpending(current); 3352 spin_unlock_irq(¤t->sigmask_lock); 3353 } 3354 } 3355 3356 /* Note that 'init' is a special process: it doesn't get 3357 * signals it doesn't want to handle. Thus you cannot 3358 * kill init even with a SIGKILL even by mistake. 3359 * 3360 * Note that we go through the signals twice: once to 3361 * check the signals that the kernel can handle, and then 3362 * we build all the user-level signal handling 3363 * stack-frames in one go after that. */ 3364 int do_signal(struct pt_regs *regs, sigset_t *oldset) 3365 { 3366 siginfo_t info; 3367 struct k_sigaction *ka; 3368 3369 /* We want the common case to go fast, which is why we 3370 * may in certain cases get here from kernel mode. Just 3371 * return without doing anything if so. 
*/ 3372 if ((regs->xcs & 3) != 3) 3373 return 1; 3374 3375 if (!oldset) 3376 oldset = ¤t->blocked; 3377 3378 for (;;) { 3379 unsigned long signr; 3380 3381 spin_lock_irq(¤t->sigmask_lock); 3382 signr = dequeue_signal(¤t->blocked, &info); 3383 spin_unlock_irq(¤t->sigmask_lock); 3384 3385 if (!signr) 3386 break; 3387 3388 if ((current->flags & PF_PTRACED) && 3389 signr != SIGKILL) { 3390 /* Let the debugger run. */ 3391 current->exit_code = signr; 3392 current->state = TASK_STOPPED; 3393 notify_parent(current, SIGCHLD); 3394 schedule(); 3395 3396 /* We're back. Did the debugger cancel the sig? */ 3397 if (!(signr = current->exit_code)) 3398 continue; 3399 current->exit_code = 0; 3400 3401 /* The debugger continued. Ignore SIGSTOP. */ 3402 if (signr == SIGSTOP) 3403 continue; 3404 3405 /* Update the siginfo structure. Is this good? */ 3406 if (signr != info.si_signo) { 3407 info.si_signo = signr; 3408 info.si_errno = 0; 3409 info.si_code = SI_USER; 3410 info.si_pid = current->p_pptr->pid; 3411 info.si_uid = current->p_pptr->uid; 3412 } 3413 3414 /* If (new) signal is now blocked, requeue it. */ 3415 if (sigismember(¤t->blocked, signr)) { 3416 send_sig_info(signr, &info, current); 3417 continue; 3418 } 3419 } 3420 3421 ka = ¤t->sig->action[signr-1]; 3422 if (ka->sa.sa_handler == SIG_IGN) { 3423 if (signr != SIGCHLD) 3424 continue; 3425 /* Check for SIGCHLD: it's special. */ 3426 while (sys_wait4(-1, NULL, WNOHANG, NULL) > 0) 3427 /* nothing */; 3428 continue; 3429 } 3430 3431 if (ka->sa.sa_handler == SIG_DFL) { 3432 int exit_code = signr; 3433 3434 /* Init gets no signals it doesn't want. 
*/ 3435 if (current->pid == 1) 3436 continue; 3437 3438 switch (signr) { 3439 case SIGCONT: case SIGCHLD: case SIGWINCH: 3440 continue; 3441 3442 case SIGTSTP: case SIGTTIN: case SIGTTOU: 3443 if (is_orphaned_pgrp(current->pgrp)) 3444 continue; 3445 /* FALLTHRU */ 3446 3447 case SIGSTOP: 3448 current->state = TASK_STOPPED; 3449 current->exit_code = signr; 3450 if (!(current->p_pptr->sig->action[SIGCHLD-1]. 3451 sa.sa_flags & SA_NOCLDSTOP)) 3452 notify_parent(current, SIGCHLD); 3453 schedule(); 3454 continue; 3455 3456 case SIGQUIT: case SIGILL: case SIGTRAP: 3457 case SIGABRT: case SIGFPE: case SIGSEGV: 3458 lock_kernel(); 3459 if (current->binfmt 3460 && current->binfmt->core_dump 3461 && current->binfmt->core_dump(signr, regs)) 3462 exit_code |= 0x80; 3463 unlock_kernel(); 3464 /* FALLTHRU */ 3465 3466 default: 3467 lock_kernel(); 3468 sigaddset(¤t->signal, signr); 3469 current->flags |= PF_SIGNALED; 3470 do_exit(exit_code); 3471 /* NOTREACHED */ 3472 } 3473 } 3474 3475 /* Whee! Actually deliver the signal. */ 3476 handle_signal(signr, ka, &info, oldset, regs); 3477 return 1; 3478 } 3479 3480 /* Did we come from a system call? */ 3481 if (regs->orig_eax >= 0) { 3482 /* Restart the system call - no handlers present */ 3483 if (regs->eax == -ERESTARTNOHAND || 3484 regs->eax == -ERESTARTSYS || 3485 regs->eax == -ERESTARTNOINTR) { 3486 regs->eax = regs->orig_eax; 3487 regs->eip -= 2; 3488 } 3489 } 3490 return 0; 3491 } /* FILE: arch/i386/kernel/smp.c */ 3492 /* 3493 * Intel MP v1.1/v1.4 specification support routines 3494 * for multi-pentium hosts. 3495 * 3496 * (c) 1995 Alan Cox, CymruNET Ltd 3497 * (c) 1998 Ingo Molnar 3498 * 3499 * Supported by Caldera http://www.caldera.com. 3500 * Much of the core SMP work is based on previous 3501 * work by Thomas Radke, to whom a great many thanks 3502 * are extended. 3503 * 3504 * Thanks to Intel for making available several 3505 * different Pentium, Pentium Pro and 3506 * Pentium-II/Xeon MP machines. 
3507 * 3508 * This code is released under the GNU public 3509 * license version 2 or later. 3510 * 3511 * Fixes 3512 * Felix Koop : NR_CPUS used properly 3513 * Jose Renau : Handle single CPU case. 3514 * Alan Cox : By repeated request 8) - 3515 * Total BogoMIP report. 3516 * Greg Wright : Fix for kernel stacks panic. 3517 * Erich Boleyn : MP v1.4 and additional changes. 3518 * Matthias Sattler : Changes for 2.1 kernel map. 3519 * Michel Lespinasse: Changes for 2.1 kernel map. 3520 * Michael Chastain : Change trampoline.S to gnu as. 3521 * Alan Cox : Dumb bug: 'B' step PPro's are fine 3522 * Ingo Molnar : Added APIC timers, based on code 3523 * from Jose Renau 3524 * Alan Cox : Added EBDA scanning 3525 * Ingo Molnar : various cleanups and rewrites */ 3526 3527 #include 3528 #include 3529 #include 3530 #include 3531 #include 3532 #include 3533 #include 3534 #include 3535 3536 #include "irq.h" 3537 3538 extern unsigned long start_kernel; 3539 extern void update_one_process( struct task_struct *p, 3540 unsigned long ticks, unsigned long user, 3541 unsigned long system, int cpu); 3542 /* Some notes on processor bugs: 3543 * 3544 * Pentium and Pentium Pro (and all CPUs) have 3545 * bugs. The Linux issues for SMP are handled as 3546 * follows. 3547 * 3548 * Pentium Pro: 3549 * Occasional delivery of 'spurious interrupt' as trap 3550 * #16. This is very rare. The kernel logs the event and 3551 * recovers 3552 * 3553 * Pentium: 3554 * There is a marginal case where REP MOVS on 100MHz SMP 3555 * machines with B stepping processors can fail. XXX 3556 * should provide an L1cache=Writethrough or L1cache=off 3557 * option. 3558 * 3559 * B stepping CPUs may hang. There are hardware work 3560 * arounds for this. We warn about it in case your board 3561 * doesnt have the work arounds. Basically thats so I can 3562 * tell anyone with a B stepping CPU and SMP problems 3563 * "tough". 3564 * 3565 * Specific items [From Pentium Processor 3566 * Specification Update] 3567 * 3568 * 1AP. 
Linux doesn't use remote read
 *      2AP.    Linux doesn't trust APIC errors
 *      3AP.    We work around this
 *      4AP.    Linux never generated 3 interrupts of the
 *              same pri to cause a lost local interrupt.
 *      5AP.    Remote read is never used
 *      9AP.    XXX NEED TO CHECK WE HANDLE THIS XXX
 *      10AP.   XXX NEED TO CHECK WE HANDLE THIS XXX
 *      11AP.   Linux reads the APIC between writes to
 *              avoid this, as per the documentation. Make
 *              sure you preserve this as it affects the C
 *              stepping chips too.
 *
 * If this sounds worrying believe me these bugs are
 * ___RARE___ and there's about nothing of note with
 * C stepping upwards. */


/* The global kernel spinlock; starts out unlocked. */
spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;

/* Prototypes for file-local helpers that are used before
 * their definitions. */

static void cache_APIC_registers (void);
static void stop_this_cpu (void);

/* Set by smp_store_cpu_info() when a B stepping Pentium
 * is found. */
static int smp_b_stepping = 0;

/* Configured maximum number of CPUs to activate. Stays
 * at -1 unless smp_setup() stores a value parsed from
 * the command line. */
static int max_cpus = -1;
/* Set to 1 once an SMP configuration has been found. */
int smp_found_config=0;

/* Bitmask of physically existing CPUs (bit number ==
 * APIC id). */
unsigned long cpu_present_map = 0;
/* Bitmask of currently online CPUs */
unsigned long cpu_online_map = 0;
/* Total count of live CPUs */
int smp_num_cpus = 1;
/* Set when the idlers are all forked */
int smp_threads_ready=0;
/* Indexed by APIC id, yields the logical CPU number
 * (filled in by do_boot_cpu()). */
volatile int cpu_number_map[NR_CPUS];
/* Indexed by logical CPU number, yields the APIC id
 * (the inverse of cpu_number_map). */
volatile int __cpu_logical_map[NR_CPUS];
/* Call-in bitmap: a secondary CPU sets its bit in word 0
 * at the end of smp_callin(), and do_boot_cpu() polls
 * that bit. Only word 0 is used today; the remaining
 * words are reserved for parallel delivery. */
static volatile
unsigned long cpu_callin_map[NR_CPUS] = {0,};
/* Call-out bitmap: do_boot_cpu() sets the target CPU's
 * bit in word 0 and smp_callin() waits for it. Only
 * word 0 is used today; the remaining words are
 * reserved for parallel delivery. */
static volatile
unsigned long cpu_callout_map[NR_CPUS] = {0,};
/* Used for the invalidate map that's also checked in the
3623 * spinlock */ 3624 volatile unsigned long smp_invalidate_needed; 3625 /* Stack vector for booting CPUs */ 3626 volatile unsigned long kstack_ptr; 3627 /* Per CPU bogomips and other parameters */ 3628 struct cpuinfo_x86 cpu_data[NR_CPUS]; 3629 /* Internal processor count */ 3630 static unsigned int num_processors = 1; 3631 /* Address of the I/O apic (not yet used) */ 3632 unsigned long mp_ioapic_addr = 0xFEC00000; 3633 /* Processor that is doing the boot up */ 3634 unsigned char boot_cpu_id = 0; 3635 /* Tripped once we need to start cross invalidating */ 3636 static int smp_activated = 0; 3637 /* APIC version number */ 3638 int apic_version[NR_CPUS]; 3639 /* Just debugging the assembler.. */ 3640 unsigned long apic_retval; 3641 3642 /* Number of times the processor holds the lock */ 3643 volatile unsigned long kernel_counter=0; 3644 /* Number of times the processor holds the syscall lock*/ 3645 volatile unsigned long syscall_count=0; 3646 3647 /* Number of IPIs delivered */ 3648 volatile unsigned long ipi_count; 3649 3650 const char lk_lockmsg[] = 3651 "lock from interrupt context at %p\n"; 3652 3653 int mp_bus_id_to_type [MAX_MP_BUSSES] = { -1, }; 3654 extern int mp_irq_entries; 3655 extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 3656 extern int mpc_default_type; 3657 int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { -1, }; 3658 int mp_current_pci_id = 0; 3659 unsigned long mp_lapic_addr = 0; 3660 /* 1 if "noapic" boot option passed */ 3661 int skip_ioapic_setup = 0; 3662 3663 /* #define SMP_DEBUG */ 3664 3665 #ifdef SMP_DEBUG 3666 #define SMP_PRINTK(x) printk x 3667 #else 3668 #define SMP_PRINTK(x) 3669 #endif 3670 3671 /* IA s/w dev Vol 3, Section 7.4 */ 3672 #define APIC_DEFAULT_PHYS_BASE 0xfee00000 3673 3674 /* Reads and clears the Pentium Timestamp-Counter */ 3675 #define READ_TSC(x) __asm__ __volatile__ ( "rdtsc" \ 3676 :"=a" (((unsigned long*)&(x))[0]), \ 3677 "=d" (((unsigned long*)&(x))[1])) 3678 3679 #define CLEAR_TSC \ 3680 __asm__ 
__volatile__ ("\t.byte 0x0f, 0x30;\n":: \ 3681 "a"(0x00001000), "d"(0x00001000), "c"(0x10):"memory") 3682 3683 /* Setup routine for controlling SMP activation 3684 * 3685 * Command-line option of "nosmp" or "maxcpus=0" 3686 * will disable SMP activation entirely (the MPS 3687 * table probe still happens, though). 3688 * 3689 * Command-line option of "maxcpus=", where 3690 * is an integer greater than 0, limits the 3691 * maximum number of CPUs activated in SMP mode to 3692 * . */ 3693 3694 void __init smp_setup(char *str, int *ints) 3695 { 3696 if (ints && ints[0] > 0) 3697 max_cpus = ints[1]; 3698 else 3699 max_cpus = 0; 3700 } 3701 3702 void ack_APIC_irq(void) 3703 { 3704 /* Clear the IPI */ 3705 3706 /* Dummy read */ 3707 apic_read(APIC_SPIV); 3708 3709 /* Docs say use 0 for future compatibility */ 3710 apic_write(APIC_EOI, 0); 3711 } 3712 3713 /* Intel MP BIOS table parsing routines: */ 3714 3715 #ifndef CONFIG_X86_VISWS_APIC 3716 /* Checksum an MP configuration block. */ 3717 3718 static int mpf_checksum(unsigned char *mp, int len) 3719 { 3720 int sum=0; 3721 while(len--) 3722 sum+=*mp++; 3723 return sum&0xFF; 3724 } 3725 3726 /* Processor encoding in an MP configuration block */ 3727 3728 static char *mpc_family(int family,int model) 3729 { 3730 static char n[32]; 3731 static char *model_defs[]= 3732 { 3733 "80486DX","80486DX", 3734 "80486SX","80486DX/2 or 80487", 3735 "80486SL","Intel5X2(tm)", 3736 "Unknown","Unknown", 3737 "80486DX/4" 3738 }; 3739 if (family==0x6) 3740 return("Pentium(tm) Pro"); 3741 if (family==0x5) 3742 return("Pentium(tm)"); 3743 if (family==0x0F && model==0x0F) 3744 return("Special controller"); 3745 if (family==0x04 && model<9) 3746 return model_defs[model]; 3747 sprintf(n,"Unknown CPU [%d:%d]",family, model); 3748 return n; 3749 } 3750 3751 /* Read the MPC */ 3752 3753 static int __init 3754 smp_read_mpc(struct mp_config_table *mpc) 3755 { 3756 char str[16]; 3757 int count=sizeof(*mpc); 3758 int ioapics = 0; 3759 unsigned char 
*mpt=((unsigned char *)mpc)+count; 3760 3761 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) 3762 { 3763 panic("SMP mptable: bad signature [%c%c%c%c]!\n", 3764 mpc->mpc_signature[0], 3765 mpc->mpc_signature[1], 3766 mpc->mpc_signature[2], 3767 mpc->mpc_signature[3]); 3768 return 1; 3769 } 3770 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) 3771 { 3772 panic("SMP mptable: checksum error!\n"); 3773 return 1; 3774 } 3775 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) 3776 { 3777 printk("Bad Config Table version (%d)!!\n", 3778 mpc->mpc_spec); 3779 return 1; 3780 } 3781 memcpy(str,mpc->mpc_oem,8); 3782 str[8]=0; 3783 memcpy(ioapic_OEM_ID,str,9); 3784 printk("OEM ID: %s ",str); 3785 3786 memcpy(str,mpc->mpc_productid,12); 3787 str[12]=0; 3788 memcpy(ioapic_Product_ID,str,13); 3789 printk("Product ID: %s ",str); 3790 3791 printk("APIC at: 0x%lX\n",mpc->mpc_lapic); 3792 3793 /* save the local APIC address, it might be 3794 * non-default */ 3795 mp_lapic_addr = mpc->mpc_lapic; 3796 3797 /* Now process the configuration blocks. 
*/ 3798 3799 while(countmpc_length) 3800 { 3801 switch(*mpt) 3802 { 3803 case MP_PROCESSOR: 3804 { 3805 struct mpc_config_processor *m= 3806 (struct mpc_config_processor *)mpt; 3807 if (m->mpc_cpuflag&CPU_ENABLED) 3808 { 3809 printk("Processor #%d %s APIC version %d\n", 3810 m->mpc_apicid, 3811 mpc_family((m->mpc_cpufeature& 3812 CPU_FAMILY_MASK)>>8, 3813 (m->mpc_cpufeature& 3814 CPU_MODEL_MASK)>>4), 3815 m->mpc_apicver); 3816 #ifdef SMP_DEBUG 3817 if (m->mpc_featureflag&(1<<0)) 3818 printk(" Floating point unit present.\n"); 3819 if (m->mpc_featureflag&(1<<7)) 3820 printk(" Machine Exception supported.\n"); 3821 if (m->mpc_featureflag&(1<<8)) 3822 printk(" 64 bit compare & exchange " 3823 "supported.\n"); 3824 if (m->mpc_featureflag&(1<<9)) 3825 printk(" Internal APIC present.\n"); 3826 #endif 3827 if (m->mpc_cpuflag&CPU_BOOTPROCESSOR) 3828 { 3829 SMP_PRINTK((" Bootup CPU\n")); 3830 boot_cpu_id=m->mpc_apicid; 3831 } 3832 else /* Boot CPU already counted */ 3833 num_processors++; 3834 3835 if (m->mpc_apicid>NR_CPUS) 3836 printk("Processor #%d unused. (Max %d " 3837 "processors).\n",m->mpc_apicid, NR_CPUS); 3838 else 3839 { 3840 int ver = m->mpc_apicver; 3841 3842 cpu_present_map|=(1<mpc_apicid); 3843 /* Validate version */ 3844 if (ver == 0x0) { 3845 printk("BIOS bug, APIC version is 0 for " 3846 "CPU#%d! fixing up to 0x10. 
(tell " 3847 "your hw vendor)\n", m->mpc_apicid); 3848 ver = 0x10; 3849 } 3850 apic_version[m->mpc_apicid] = ver; 3851 } 3852 } 3853 mpt+=sizeof(*m); 3854 count+=sizeof(*m); 3855 break; 3856 } 3857 case MP_BUS: 3858 { 3859 struct mpc_config_bus *m= 3860 (struct mpc_config_bus *)mpt; 3861 memcpy(str,m->mpc_bustype,6); 3862 str[6]=0; 3863 SMP_PRINTK(("Bus #%d is %s\n", 3864 m->mpc_busid, 3865 str)); 3866 if ((strncmp(m->mpc_bustype,"ISA",3) == 0) || 3867 (strncmp(m->mpc_bustype,"EISA",4) == 0)) 3868 mp_bus_id_to_type[m->mpc_busid] = 3869 MP_BUS_ISA; 3870 else 3871 if (strncmp(m->mpc_bustype,"PCI",3) == 0) { 3872 mp_bus_id_to_type[m->mpc_busid] = 3873 MP_BUS_PCI; 3874 mp_bus_id_to_pci_bus[m->mpc_busid] = 3875 mp_current_pci_id; 3876 mp_current_pci_id++; 3877 } 3878 mpt+=sizeof(*m); 3879 count+=sizeof(*m); 3880 break; 3881 } 3882 case MP_IOAPIC: 3883 { 3884 struct mpc_config_ioapic *m= 3885 (struct mpc_config_ioapic *)mpt; 3886 if (m->mpc_flags&MPC_APIC_USABLE) 3887 { 3888 ioapics++; 3889 printk("I/O APIC #%d Version %d at 0x%lX.\n", 3890 m->mpc_apicid,m->mpc_apicver, 3891 m->mpc_apicaddr); 3892 /* we use the first one only currently */ 3893 if (ioapics == 1) 3894 mp_ioapic_addr = m->mpc_apicaddr; 3895 } 3896 mpt+=sizeof(*m); 3897 count+=sizeof(*m); 3898 break; 3899 } 3900 case MP_INTSRC: 3901 { 3902 struct mpc_config_intsrc *m= 3903 (struct mpc_config_intsrc *)mpt; 3904 3905 mp_irqs [mp_irq_entries] = *m; 3906 if (++mp_irq_entries == MAX_IRQ_SOURCES) { 3907 printk("Max irq sources exceeded!!\n"); 3908 printk("Skipping remaining sources.\n"); 3909 --mp_irq_entries; 3910 } 3911 3912 mpt+=sizeof(*m); 3913 count+=sizeof(*m); 3914 break; 3915 } 3916 case MP_LINTSRC: 3917 { 3918 struct mpc_config_intlocal *m= 3919 (struct mpc_config_intlocal *)mpt; 3920 mpt+=sizeof(*m); 3921 count+=sizeof(*m); 3922 break; 3923 } 3924 } 3925 } 3926 if (ioapics > 1) 3927 { 3928 printk("Warning: " 3929 "Multiple IO-APICs not yet supported.\n"); 3930 printk("Warning: switching to non APIC 
mode.\n"); 3931 skip_ioapic_setup=1; 3932 } 3933 return num_processors; 3934 } 3935 3936 /* Scan the memory blocks for an SMP configuration block. 3937 */ 3938 3939 static int __init smp_scan_config(unsigned long base, 3940 unsigned long length) 3941 { 3942 unsigned long *bp=phys_to_virt(base); 3943 struct intel_mp_floating *mpf; 3944 3945 SMP_PRINTK(("Scan SMP from %p for %ld bytes.\n", 3946 bp,length)); 3947 if (sizeof(*mpf)!=16) 3948 printk("Error: MPF size\n"); 3949 3950 while (length>0) 3951 { 3952 if (*bp==SMP_MAGIC_IDENT) 3953 { 3954 mpf=(struct intel_mp_floating *)bp; 3955 if (mpf->mpf_length==1 && 3956 !mpf_checksum((unsigned char *)bp,16) && 3957 (mpf->mpf_specification == 1 3958 || mpf->mpf_specification == 4) ) 3959 { 3960 printk("Intel MultiProcessor Specification " 3961 "v1.%d\n", mpf->mpf_specification); 3962 if (mpf->mpf_feature2&(1<<7)) 3963 printk(" IMCR and PIC " 3964 "compatibility mode.\n"); 3965 else 3966 printk(" Virtual Wire " 3967 "compatibility mode.\n"); 3968 smp_found_config=1; 3969 /* Now see if we need to read further. */ 3970 if (mpf->mpf_feature1!=0) 3971 { 3972 unsigned long cfg; 3973 3974 /* local APIC has default address */ 3975 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 3976 /* We need to know what the local APIC id of 3977 * the boot CPU is! */ 3978 3979 /* HACK HACK HACK HACK HACK HACK HACK HACK HACK HACK 3980 * It's not just a crazy hack. ;-) */ 3981 3982 /* Standard page mapping functions don't work 3983 * yet. We know that page 0 is not used. 3984 * Steal it for now! */ 3985 3986 cfg=pg0[0]; 3987 pg0[0] = (mp_lapic_addr | 3988 _PAGE_RW | _PAGE_PRESENT); 3989 local_flush_tlb(); 3990 3991 boot_cpu_id = 3992 GET_APIC_ID(*((volatile unsigned long *) 3993 APIC_ID)); 3994 3995 /* Give it back */ 3996 pg0[0]= cfg; 3997 local_flush_tlb(); 3998 3999 /* 4000 * END OF HACK END OF HACK END OF HACK END OF HACK 4001 */ 4002 /* 2 CPUs, numbered 0 & 1. 
*/ 4003 cpu_present_map=3; 4004 num_processors=2; 4005 printk("I/O APIC at 0xFEC00000.\n"); 4006 4007 /* Save the default type number, we need it 4008 * later to set the IO-APIC up properly: */ 4009 mpc_default_type = mpf->mpf_feature1; 4010 4011 printk("Bus #0 is "); 4012 } 4013 switch(mpf->mpf_feature1) 4014 { 4015 case 1: 4016 case 5: 4017 printk("ISA\n"); 4018 break; 4019 case 2: 4020 printk("EISA with no IRQ8 chaining\n"); 4021 break; 4022 case 6: 4023 case 3: 4024 printk("EISA\n"); 4025 break; 4026 case 4: 4027 case 7: 4028 printk("MCA\n"); 4029 break; 4030 case 0: 4031 break; 4032 default: 4033 printk("???\nUnknown standard configuration " 4034 "%d\n", mpf->mpf_feature1); 4035 return 1; 4036 } 4037 if (mpf->mpf_feature1>4) 4038 { 4039 printk("Bus #1 is PCI\n"); 4040 4041 /* Set local APIC version to the integrated 4042 * form. It's initialized to zero otherwise, 4043 * representing a discrete 82489DX. */ 4044 apic_version[0] = 0x10; 4045 apic_version[1] = 0x10; 4046 } 4047 /* Read the physical hardware table. Anything 4048 * here will override the defaults. */ 4049 if (mpf->mpf_physptr) 4050 smp_read_mpc((void *)mpf->mpf_physptr); 4051 4052 __cpu_logical_map[0] = boot_cpu_id; 4053 global_irq_holder = boot_cpu_id; 4054 current->processor = boot_cpu_id; 4055 4056 printk("Processors: %d\n", num_processors); 4057 /* Only use the first configuration found. */ 4058 return 1; 4059 } 4060 } 4061 bp+=4; 4062 length-=16; 4063 } 4064 4065 return 0; 4066 } 4067 4068 void __init init_intel_smp (void) 4069 { 4070 /* FIXME: Linux assumes you have 640K of base ram.. 4071 * this continues the error... 
4072 * 4073 * 1) Scan the bottom 1K for a signature 4074 * 2) Scan the top 1K of base RAM 4075 * 3) Scan the 64K of bios */ 4076 if (!smp_scan_config(0x0,0x400) && 4077 !smp_scan_config(639*0x400,0x400) && 4078 !smp_scan_config(0xF0000,0x10000)) { 4079 /* If it is an SMP machine we should know now, unless 4080 * the configuration is in an EISA/MCA bus machine 4081 * with an extended bios data area. 4082 * 4083 * there is a real-mode segmented pointer pointing to 4084 * the 4K EBDA area at 0x40E, calculate and scan it 4085 * here. 4086 * 4087 * NOTE! There are Linux loaders that will corrupt 4088 * the EBDA area, and as such this kind of SMP config 4089 * may be less trustworthy, simply because the SMP 4090 * table may have been stomped on during early 4091 * boot. These loaders are buggy and should be fixed. 4092 */ 4093 unsigned int address; 4094 4095 address = *(unsigned short *)phys_to_virt(0x40E); 4096 address<<=4; 4097 smp_scan_config(address, 0x1000); 4098 if (smp_found_config) 4099 printk(KERN_WARNING "WARNING: MP table in the EBDA" 4100 " can be UNSAFE, contact linux-smp@vger.rutgers." 4101 "edu if you experience SMP problems!\n"); 4102 } 4103 } 4104 4105 #else 4106 4107 /* The Visual Workstation is Intel MP compliant in the 4108 * hardware sense, but it doesnt have a 4109 * BIOS(-configuration table). No problem for Linux. */ 4110 void __init init_visws_smp(void) 4111 { 4112 smp_found_config = 1; 4113 4114 cpu_present_map |= 2; /* or in id 1 */ 4115 apic_version[1] |= 0x10; /* integrated APIC */ 4116 apic_version[0] |= 0x10; 4117 4118 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 4119 } 4120 4121 #endif 4122 4123 /* - Intel MP Configuration Table 4124 * - or SGI Visual Workstation configuration */ 4125 void __init init_smp_config (void) 4126 { 4127 #ifndef CONFIG_VISWS 4128 init_intel_smp(); 4129 #else 4130 init_visws_smp(); 4131 #endif 4132 } 4133 4134 /* Trampoline 80x86 program as an array. 
*/ 4135 4136 extern unsigned char trampoline_data []; 4137 extern unsigned char trampoline_end []; 4138 static unsigned char *trampoline_base; 4139 4140 /* Currently trivial. Write the real->protected mode 4141 * bootstrap into the page concerned. The caller has made 4142 * sure it's suitably aligned. */ 4143 4144 static unsigned long __init setup_trampoline(void) 4145 { 4146 memcpy(trampoline_base, trampoline_data, 4147 trampoline_end - trampoline_data); 4148 return virt_to_phys(trampoline_base); 4149 } 4150 4151 /* We are called very early to get the low memory for the 4152 * SMP bootup trampoline page. */ 4153 unsigned long __init 4154 smp_alloc_memory(unsigned long mem_base) 4155 { 4156 if (virt_to_phys((void *)mem_base) >= 0x9F000) 4157 panic("smp_alloc_memory: Insufficient low memory for" 4158 " kernel trampoline 0x%lx.", mem_base); 4159 trampoline_base = (void *)mem_base; 4160 return mem_base + PAGE_SIZE; 4161 } 4162 4163 /* The bootstrap kernel entry code has set these up. Save 4164 * them for a given CPU */ 4165 void __init smp_store_cpu_info(int id) 4166 { 4167 struct cpuinfo_x86 *c=&cpu_data[id]; 4168 4169 *c = boot_cpu_data; 4170 c->pte_quick = 0; 4171 c->pgd_quick = 0; 4172 c->pgtable_cache_sz = 0; 4173 identify_cpu(c); 4174 /* Mask B, Pentium, but not Pentium MMX */ 4175 if (c->x86_vendor == X86_VENDOR_INTEL && 4176 c->x86 == 5 && 4177 c->x86_mask >= 1 && c->x86_mask <= 4 && 4178 c->x86_model <= 3) 4179 /* Remember we have B step Pentia with bugs */ 4180 smp_b_stepping=1; 4181 } 4182 4183 /* Architecture specific routine called by the kernel 4184 * just before init is fired off. This allows the BP to 4185 * have everything in order [we hope]. At the end of 4186 * this all the APs will hit the system scheduling and 4187 * off we go. Each AP will load the system gdt's and jump 4188 * through the kernel init into idle(). At this point the 4189 * scheduler will one day take over and give them jobs to 4190 * do. 
smp_callin is a standard routine we use to track 4191 * CPUs as they power up. */ 4192 4193 static atomic_t smp_commenced = ATOMIC_INIT(0); 4194 4195 void __init smp_commence(void) 4196 { 4197 /* Lets the callins below out of their loop. */ 4198 SMP_PRINTK(("Setting commenced=1, go go go\n")); 4199 4200 wmb(); 4201 atomic_set(&smp_commenced,1); 4202 } 4203 4204 void __init enable_local_APIC(void) 4205 { 4206 unsigned long value; 4207 4208 value = apic_read(APIC_SPIV); 4209 value |= (1<<8); /* Enable APIC (bit==1) */ 4210 value &= ~(1<<9); /* Enable focus processor (bit==0) */ 4211 value |= 0xff; /* Set spurious IRQ vector to 0xff */ 4212 apic_write(APIC_SPIV,value); 4213 4214 /* Set Task Priority to 'accept all' */ 4215 value = apic_read(APIC_TASKPRI); 4216 value &= ~APIC_TPRI_MASK; 4217 apic_write(APIC_TASKPRI,value); 4218 4219 /* Clear the logical destination ID, just to be safe. 4220 * also, put the APIC into flat delivery mode. */ 4221 value = apic_read(APIC_LDR); 4222 value &= ~APIC_LDR_MASK; 4223 apic_write(APIC_LDR,value); 4224 4225 value = apic_read(APIC_DFR); 4226 value |= SET_APIC_DFR(0xf); 4227 apic_write(APIC_DFR, value); 4228 4229 udelay(100); /* B safe */ 4230 } 4231 4232 unsigned long __init 4233 init_smp_mappings(unsigned long memory_start) 4234 { 4235 unsigned long apic_phys; 4236 4237 memory_start = PAGE_ALIGN(memory_start); 4238 if (smp_found_config) { 4239 apic_phys = mp_lapic_addr; 4240 } else { 4241 /* set up a fake all zeroes page to simulate the 4242 * local APIC and another one for the IO-APIC. We 4243 * could use the real zero-page, but it's safer this 4244 * way if some buggy code writes to this page ... 
*/ 4245 apic_phys = __pa(memory_start); 4246 memset((void *)memory_start, 0, PAGE_SIZE); 4247 memory_start += PAGE_SIZE; 4248 } 4249 set_fixmap(FIX_APIC_BASE,apic_phys); 4250 printk("mapped APIC to %08lx (%08lx)\n", 4251 APIC_BASE, apic_phys); 4252 4253 #ifdef CONFIG_X86_IO_APIC 4254 { 4255 unsigned long ioapic_phys; 4256 4257 if (smp_found_config) { 4258 ioapic_phys = mp_ioapic_addr; 4259 } else { 4260 ioapic_phys = __pa(memory_start); 4261 memset((void *)memory_start, 0, PAGE_SIZE); 4262 memory_start += PAGE_SIZE; 4263 } 4264 set_fixmap(FIX_IO_APIC_BASE,ioapic_phys); 4265 printk("mapped IOAPIC to %08lx (%08lx)\n", 4266 fix_to_virt(FIX_IO_APIC_BASE), ioapic_phys); 4267 } 4268 #endif 4269 4270 return memory_start; 4271 } 4272 4273 extern void calibrate_delay(void); 4274 4275 void __init smp_callin(void) 4276 { 4277 int cpuid; 4278 unsigned long timeout; 4279 4280 /* (This works even if the APIC is not enabled.) */ 4281 cpuid = GET_APIC_ID(apic_read(APIC_ID)); 4282 4283 SMP_PRINTK(("CPU#%d waiting for CALLOUT\n", cpuid)); 4284 4285 /* STARTUP IPIs are fragile beasts as they might 4286 * sometimes trigger some glue motherboard 4287 * logic. Complete APIC bus silence for 1 second, this 4288 * overestimates the time the boot CPU is spending to 4289 * send the up to 2 STARTUP IPIs by a factor of 4290 * two. This should be enough. */ 4291 4292 /* Waiting 2s total for startup (udelay is not yet 4293 * working) */ 4294 timeout = jiffies + 2*HZ; 4295 while (time_before(jiffies,timeout)) 4296 { 4297 /* Has the boot CPU finished its STARTUP sequence? */ 4298 if (test_bit(cpuid, 4299 (unsigned long *)&cpu_callout_map[0])) 4300 break; 4301 } 4302 4303 while (!time_before(jiffies,timeout)) { 4304 printk("BUG: CPU%d started up but did not get a " 4305 "callout!\n", cpuid); 4306 stop_this_cpu(); 4307 } 4308 4309 /* the boot CPU has finished the init stage and is 4310 * spinning on callin_map until we finish. We are free 4311 * to set up this CPU, first the APIC. 
(this is 4312 * probably redundant on most boards) */ 4313 SMP_PRINTK(("CALLIN, before enable_local_APIC().\n")); 4314 enable_local_APIC(); 4315 4316 /* Set up our APIC timer. */ 4317 setup_APIC_clock(); 4318 4319 __sti(); 4320 4321 #ifdef CONFIG_MTRR 4322 /* Must be done before calibration delay is computed */ 4323 mtrr_init_secondary_cpu (); 4324 #endif 4325 /* Get our bogomips. */ 4326 calibrate_delay(); 4327 SMP_PRINTK(("Stack at about %p\n",&cpuid)); 4328 4329 /* Save our processor parameters */ 4330 smp_store_cpu_info(cpuid); 4331 4332 /* Allow the master to continue. */ 4333 set_bit(cpuid, (unsigned long *)&cpu_callin_map[0]); 4334 } 4335 4336 int cpucount = 0; 4337 4338 extern int cpu_idle(void * unused); 4339 4340 /* Activate a secondary processor. */ 4341 int __init start_secondary(void *unused) 4342 { 4343 /* Don't put anything before smp_callin(), SMP booting 4344 * is too fragile that we want to limit the things done 4345 * here to the most necessary things. */ 4346 smp_callin(); 4347 while (!atomic_read(&smp_commenced)) 4348 /* nothing */ ; 4349 return cpu_idle(NULL); 4350 } 4351 4352 /* Everything has been set up for the secondary CPUs - 4353 * they just need to reload everything from the task 4354 * structure */ 4355 void __init initialize_secondary(void) 4356 { 4357 struct thread_struct * p = ¤t->tss; 4358 4359 /* Load up the LDT and the task register. */ 4360 asm volatile("lldt %%ax": :"a" (p->ldt)); 4361 asm volatile("ltr %%ax": :"a" (p->tr)); 4362 stts(); 4363 4364 /* We don't actually need to load the full TSS, 4365 * basically just the stack pointer and the eip. 
*/ 4366 4367 asm volatile( 4368 "movl %0,%%esp\n\t" 4369 "jmp *%1" 4370 : 4371 :"r" (p->esp),"r" (p->eip)); 4372 } 4373 4374 extern struct { 4375 void * esp; 4376 unsigned short ss; 4377 } stack_start; 4378 4379 static void __init do_boot_cpu(int i) 4380 { 4381 unsigned long cfg; 4382 pgd_t maincfg; 4383 struct task_struct *idle; 4384 unsigned long send_status, accept_status; 4385 int timeout, num_starts, j; 4386 unsigned long start_eip; 4387 4388 /* We need an idle process for each processor. */ 4389 4390 kernel_thread(start_secondary, NULL, CLONE_PID); 4391 cpucount++; 4392 4393 idle = task[cpucount]; 4394 if (!idle) 4395 panic("No idle process for CPU %d", i); 4396 4397 idle->processor = i; 4398 __cpu_logical_map[cpucount] = i; 4399 cpu_number_map[i] = cpucount; 4400 4401 /* start_eip had better be page-aligned! */ 4402 start_eip = setup_trampoline(); 4403 4404 /* So we see what's up */ 4405 printk("Booting processor %d eip %lx\n", i, start_eip); 4406 stack_start.esp = (void *) (1024 + PAGE_SIZE + 4407 (char *)idle); 4408 4409 /* This grunge runs the startup process for the 4410 * targeted processor. */ 4411 4412 SMP_PRINTK(("Setting warm reset code and vector.\n")); 4413 4414 CMOS_WRITE(0xa, 0xf); 4415 local_flush_tlb(); 4416 SMP_PRINTK(("1.\n")); 4417 *((volatile unsigned short *) phys_to_virt(0x469)) = 4418 start_eip >> 4; 4419 SMP_PRINTK(("2.\n")); 4420 *((volatile unsigned short *) phys_to_virt(0x467)) = 4421 start_eip & 0xf; 4422 SMP_PRINTK(("3.\n")); 4423 4424 maincfg=swapper_pg_dir[0]; 4425 ((unsigned long *)swapper_pg_dir)[0]=0x102007; 4426 4427 /* Be paranoid about clearing APIC errors. */ 4428 4429 if ( apic_version[i] & 0xF0 ) 4430 { 4431 apic_write(APIC_ESR, 0); 4432 accept_status = (apic_read(APIC_ESR) & 0xEF); 4433 } 4434 4435 /* Status is now clean */ 4436 4437 send_status = 0; 4438 accept_status = 0; 4439 4440 /* Starting actual IPI sequence... 
*/ 4441 4442 SMP_PRINTK(("Asserting INIT.\n")); 4443 4444 /* Turn INIT on */ 4445 4446 cfg=apic_read(APIC_ICR2); 4447 cfg&=0x00FFFFFF; 4448 /* Target chip */ 4449 apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i)); 4450 cfg=apic_read(APIC_ICR); 4451 /* Clear bits */ 4452 cfg&=~0xCDFFF; 4453 cfg |= (APIC_DEST_LEVELTRIG | APIC_DEST_ASSERT | 4454 APIC_DEST_DM_INIT); 4455 /* Send IPI */ 4456 apic_write(APIC_ICR, cfg); 4457 4458 udelay(200); 4459 SMP_PRINTK(("Deasserting INIT.\n")); 4460 4461 cfg=apic_read(APIC_ICR2); 4462 cfg&=0x00FFFFFF; 4463 /* Target chip */ 4464 apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i)); 4465 cfg=apic_read(APIC_ICR); 4466 /* Clear bits */ 4467 cfg&=~0xCDFFF; 4468 cfg |= (APIC_DEST_LEVELTRIG | APIC_DEST_DM_INIT); 4469 /* Send IPI */ 4470 apic_write(APIC_ICR, cfg); 4471 4472 /* Should we send STARTUP IPIs? 4473 * 4474 * Determine this based on the APIC version. If we 4475 * don't have an integrated APIC, don't send the 4476 * STARTUP IPIs. */ 4477 4478 if ( apic_version[i] & 0xF0 ) 4479 num_starts = 2; 4480 else 4481 num_starts = 0; 4482 4483 /* Run STARTUP IPI loop. 
*/ 4484 4485 for (j = 1; !(send_status || accept_status) 4486 && (j <= num_starts) ; j++) 4487 { 4488 SMP_PRINTK(("Sending STARTUP #%d.\n",j)); 4489 apic_write(APIC_ESR, 0); 4490 SMP_PRINTK(("After apic_write.\n")); 4491 4492 /* STARTUP IPI */ 4493 4494 cfg=apic_read(APIC_ICR2); 4495 cfg&=0x00FFFFFF; 4496 /* Target chip */ 4497 apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(i)); 4498 cfg=apic_read(APIC_ICR); 4499 /* Clear bits */ 4500 cfg&=~0xCDFFF; 4501 /* Boot on the stack */ 4502 cfg |= (APIC_DEST_DM_STARTUP | (start_eip >> 12)); 4503 SMP_PRINTK(("Before start apic_write.\n")); 4504 /* Kick the second */ 4505 apic_write(APIC_ICR, cfg); 4506 4507 SMP_PRINTK(("Startup point 1.\n")); 4508 4509 timeout = 0; 4510 SMP_PRINTK(("Waiting for send to finish...\n")); 4511 do { 4512 SMP_PRINTK(("+")); 4513 udelay(100); 4514 send_status = apic_read(APIC_ICR) & 0x1000; 4515 } while (send_status && (timeout++ < 1000)); 4516 4517 /* Give the other CPU some time to accept the IPI. */ 4518 udelay(200); 4519 accept_status = (apic_read(APIC_ESR) & 0xEF); 4520 } 4521 SMP_PRINTK(("After Startup.\n")); 4522 4523 if (send_status) /* APIC never delivered?? */ 4524 printk("APIC never delivered???\n"); 4525 if (accept_status) /* Send accept error */ 4526 printk("APIC delivery error (%lx).\n",accept_status); 4527 4528 if ( !(send_status || accept_status) ) 4529 { 4530 /* allow APs to start initializing. */ 4531 SMP_PRINTK(("Before Callout %d.\n", i)); 4532 set_bit(i, (unsigned long *)&cpu_callout_map[0]); 4533 SMP_PRINTK(("After Callout %d.\n", i)); 4534 4535 for(timeout=0;timeout<50000;timeout++) 4536 { 4537 if (cpu_callin_map[0]&(1< cpucount+1)) 4727 { 4728 do_boot_cpu(i); 4729 } 4730 4731 /* Make sure we unmap all failed CPUs */ 4732 4733 if (cpu_number_map[i] == -1 && 4734 (cpu_online_map & (1 << i))) { 4735 printk("CPU #%d not responding. " 4736 "Removing from cpu_online_map.\n", i); 4737 cpu_online_map &= ~(1 << i); 4738 } 4739 } 4740 4741 /* Cleanup possible dangling ends... 
*/ 4742 #ifndef CONFIG_VISWS 4743 { 4744 unsigned long cfg; 4745 4746 /* Install writable page 0 entry. */ 4747 cfg = pg0[0]; 4748 /* writeable, present, addr 0 */ 4749 pg0[0] = _PAGE_RW | _PAGE_PRESENT; 4750 local_flush_tlb(); 4751 4752 /* Paranoid: Set warm reset code and vector here back 4753 * to default values. */ 4754 CMOS_WRITE(0, 0xf); 4755 4756 *((volatile long *) phys_to_virt(0x467)) = 0; 4757 4758 /* Restore old page 0 entry. */ 4759 pg0[0] = cfg; 4760 local_flush_tlb(); 4761 } 4762 #endif 4763 4764 /* Allow the user to impress friends. */ 4765 SMP_PRINTK(("Before bogomips.\n")); 4766 if (cpucount==0) 4767 { 4768 printk(KERN_ERR 4769 "Error: only one processor found.\n"); 4770 cpu_online_map = (1<CPU IPIs and self-IPIs too.*/ 4811 4812 4813 /* Silly serialization to work around CPU bug in P5s. We 4814 * can safely turn it off on a 686. */ 4815 #ifdef CONFIG_X86_GOOD_APIC 4816 # define FORCE_APIC_SERIALIZATION 0 4817 #else 4818 # define FORCE_APIC_SERIALIZATION 1 4819 #endif 4820 4821 static unsigned int cached_APIC_ICR; 4822 static unsigned int cached_APIC_ICR2; 4823 4824 /* Caches reserved bits, APIC reads are (mildly) 4825 * expensive and force otherwise unnecessary CPU 4826 * synchronization. (We could cache other APIC registers 4827 * too, but these are the main ones used in RL.) */ 4828 #define slow_ICR (apic_read(APIC_ICR) & ~0xFDFFF) 4829 #define slow_ICR2 (apic_read(APIC_ICR2) & 0x00FFFFFF) 4830 4831 void cache_APIC_registers (void) 4832 { 4833 cached_APIC_ICR = slow_ICR; 4834 cached_APIC_ICR2 = slow_ICR2; 4835 mb(); 4836 } 4837 4838 static inline unsigned int __get_ICR (void) 4839 { 4840 #if FORCE_APIC_SERIALIZATION 4841 /* Wait for the APIC to become ready - this should 4842 * never occur. It's a debugging check really. 
*/ 4843 int count = 0; 4844 unsigned int cfg; 4845 4846 while (count < 1000) 4847 { 4848 cfg = slow_ICR; 4849 if (!(cfg&(1<<12))) { 4850 if (count) 4851 atomic_add(count, (atomic_t*)&ipi_count); 4852 return cfg; 4853 } 4854 count++; 4855 udelay(10); 4856 } 4857 printk("CPU #%d: previous IPI still not cleared " 4858 "after 10mS\n", smp_processor_id()); 4859 return cfg; 4860 #else 4861 return cached_APIC_ICR; 4862 #endif 4863 } 4864 4865 static inline unsigned int __get_ICR2 (void) 4866 { 4867 #if FORCE_APIC_SERIALIZATION 4868 return slow_ICR2; 4869 #else 4870 return cached_APIC_ICR2; 4871 #endif 4872 } 4873 4874 static inline int __prepare_ICR (unsigned int shortcut, 4875 int vector) 4876 { 4877 unsigned int cfg; 4878 4879 cfg = __get_ICR(); 4880 cfg |= APIC_DEST_DM_FIXED|shortcut|vector; 4881 4882 return cfg; 4883 } 4884 4885 static inline int __prepare_ICR2 (unsigned int dest) 4886 { 4887 unsigned int cfg; 4888 4889 cfg = __get_ICR2(); 4890 cfg |= SET_APIC_DEST_FIELD(dest); 4891 4892 return cfg; 4893 } 4894 4895 static inline void 4896 __send_IPI_shortcut(unsigned int shortcut, int vector) 4897 { 4898 unsigned int cfg; 4899 /* Subtle. In the case of the 'never do double writes' 4900 * workaround we have to lock out interrupts to be 4901 * safe. Otherwise it's just one single atomic write to 4902 * the APIC, no need for cli/sti. */ 4903 #if FORCE_APIC_SERIALIZATION 4904 unsigned long flags; 4905 4906 __save_flags(flags); 4907 __cli(); 4908 #endif 4909 4910 /* No need to touch the target chip field */ 4911 4912 cfg = __prepare_ICR(shortcut, vector); 4913 4914 /* Send the IPI. The write to APIC_ICR 4915 * fires this off. 
*/ 4916 apic_write(APIC_ICR, cfg); 4917 #if FORCE_APIC_SERIALIZATION 4918 __restore_flags(flags); 4919 #endif 4920 } 4921 4922 static inline void send_IPI_allbutself(int vector) 4923 { 4924 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); 4925 } 4926 4927 static inline void send_IPI_all(int vector) 4928 { 4929 __send_IPI_shortcut(APIC_DEST_ALLINC, vector); 4930 } 4931 4932 void send_IPI_self(int vector) 4933 { 4934 __send_IPI_shortcut(APIC_DEST_SELF, vector); 4935 } 4936 4937 static inline void send_IPI_single(int dest, int vector) 4938 { 4939 unsigned long cfg; 4940 #if FORCE_APIC_SERIALIZATION 4941 unsigned long flags; 4942 4943 __save_flags(flags); 4944 __cli(); 4945 #endif 4946 4947 /* prepare target chip field */ 4948 4949 cfg = __prepare_ICR2(dest); 4950 apic_write(APIC_ICR2, cfg); 4951 4952 /* program the ICR*/ 4953 cfg = __prepare_ICR(0, vector); 4954 4955 /* Send the IPI. The write to APIC_ICR fires this off. 4956 */ 4957 apic_write(APIC_ICR, cfg); 4958 #if FORCE_APIC_SERIALIZATION 4959 __restore_flags(flags); 4960 #endif 4961 } 4962 4963 /* This is fraught with deadlocks. Probably the situation 4964 * is not that bad as in the early days of SMP, so we 4965 * might ease some of the paranoia here. */ 4966 4967 void smp_flush_tlb(void) 4968 { 4969 int cpu = smp_processor_id(); 4970 int stuck; 4971 unsigned long flags; 4972 4973 /* it's important that we do not generate any APIC 4974 * traffic until the AP CPUs have booted up! */ 4975 if (cpu_online_map) { 4976 /* The assignment is safe because it's volatile so 4977 * the compiler cannot reorder it, because the i586 4978 * has strict memory ordering and because only the 4979 * kernel lock holder may issue a tlb flush. If you 4980 * break any one of those three change this to an 4981 * atomic bus locked or. */ 4982 4983 smp_invalidate_needed = cpu_online_map; 4984 4985 /* Processors spinning on some lock with IRQs 4986 * disabled will see this IRQ late. 
The 4987 * smp_invalidate_needed map will ensure they don't 4988 * do a spurious flush tlb or miss one. */ 4989 4990 __save_flags(flags); 4991 __cli(); 4992 4993 send_IPI_allbutself(INVALIDATE_TLB_VECTOR); 4994 4995 /* Spin waiting for completion */ 4996 stuck = 50000000; 4997 while (smp_invalidate_needed) { 4998 /* Take care of "crossing" invalidates */ 4999 if (test_bit(cpu, &smp_invalidate_needed)) 5000 clear_bit(cpu, &smp_invalidate_needed); 5001 --stuck; 5002 if (!stuck) { 5003 printk("stuck on TLB IPI wait (CPU#%d)\n",cpu); 5004 break; 5005 } 5006 } 5007 __restore_flags(flags); 5008 } 5009 5010 /* Flush the local TLB */ 5011 local_flush_tlb(); 5012 } 5013 5014 5015 /* this function sends a 'reschedule' IPI to another CPU. 5016 * it goes straight through and wastes no time 5017 * serializing anything. Worst case is that we lose a 5018 * reschedule ... */ 5019 void smp_send_reschedule(int cpu) 5020 { 5021 send_IPI_single(cpu, RESCHEDULE_VECTOR); 5022 } 5023 5024 /* this function sends a 'stop' IPI to all other CPUs in 5025 * the system. it goes straight through. */ 5026 void smp_send_stop(void) 5027 { 5028 send_IPI_allbutself(STOP_CPU_VECTOR); 5029 } 5030 5031 /* this function sends an 'reload MTRR state' IPI to all 5032 * other CPUs in the system. it goes straight through, 5033 * completion processing is done on the mttr.c level. */ 5034 void smp_send_mtrr(void) 5035 { 5036 send_IPI_allbutself(MTRR_CHANGE_VECTOR); 5037 } 5038 5039 /* Local timer interrupt handler. It does both profiling 5040 * and process statistics/rescheduling. 5041 * 5042 * We do profiling in every local tick, 5043 * statistics/rescheduling happen only every 'profiling 5044 * multiplier' ticks. The default multiplier is 1 and it 5045 * can be changed by writing the new multiplier value 5046 * into /proc/profile. */ 5047 void smp_local_timer_interrupt(struct pt_regs * regs) 5048 { 5049 int cpu = smp_processor_id(); 5050 5051 /* The profiling function is SMP safe. 
(nothing can 5052 * mess around with "current", and the profiling 5053 * counters are updated with atomic operations). This 5054 * is especially useful with a profiling 5055 * multiplier != 1 */ 5056 if (!user_mode(regs)) 5057 x86_do_profile(regs->eip); 5058 5059 if (!--prof_counter[cpu]) { 5060 int user=0,system=0; 5061 struct task_struct * p = current; 5062 5063 /* After doing the above, we need to make like a 5064 * normal interrupt - otherwise timer interrupts 5065 * ignore the global interrupt lock, which is the 5066 * WrongThing (tm) to do. */ 5067 5068 if (user_mode(regs)) 5069 user=1; 5070 else 5071 system=1; 5072 5073 irq_enter(cpu, 0); 5074 if (p->pid) { 5075 update_one_process(p, 1, user, system, cpu); 5076 5077 p->counter -= 1; 5078 if (p->counter < 0) { 5079 p->counter = 0; 5080 p->need_resched = 1; 5081 } 5082 if (p->priority < DEF_PRIORITY) { 5083 kstat.cpu_nice += user; 5084 kstat.per_cpu_nice[cpu] += user; 5085 } else { 5086 kstat.cpu_user += user; 5087 kstat.per_cpu_user[cpu] += user; 5088 } 5089 5090 kstat.cpu_system += system; 5091 kstat.per_cpu_system[cpu] += system; 5092 5093 } 5094 prof_counter[cpu]=prof_multiplier[cpu]; 5095 irq_exit(cpu, 0); 5096 } 5097 5098 /* We take the 'long' return path, and there every 5099 * subsystem grabs the apropriate locks (kernel lock/ 5100 * irq lock). 5101 * 5102 * we might want to decouple profiling from the 'long 5103 * path', and do the profiling totally in assembly. 5104 * 5105 * Currently this isn't too much of an issue 5106 * (performance wise), we can take more than 100K local 5107 * irqs per second on a 100 MHz P5. */ 5108 } 5109 5110 /* Local APIC timer interrupt. This is the most natural 5111 * way for doing local interrupts, but local timer 5112 * interrupts can be emulated by broadcast interrupts 5113 * too. [in case the hw doesnt support APIC timers] 5114 * 5115 * [ if a single-CPU system runs an SMP kernel then we 5116 * call the local interrupt as well. 
Thus we cannot 5117 * inline the local irq ... ] */ 5118 void smp_apic_timer_interrupt(struct pt_regs * regs) 5119 { 5120 /* NOTE! We'd better ACK the irq immediately, because 5121 * timer handling can be slow, and we want to be able 5122 * to accept NMI tlb invalidates during this time. */ 5123 ack_APIC_irq(); 5124 smp_local_timer_interrupt(regs); 5125 } 5126 5127 /* Reschedule call back. Nothing to do, all the work is 5128 * done automatically when we return from the interrupt. 5129 */ 5130 asmlinkage void smp_reschedule_interrupt(void) 5131 { 5132 ack_APIC_irq(); 5133 } 5134 5135 /* Invalidate call-back */ 5136 asmlinkage void smp_invalidate_interrupt(void) 5137 { 5138 if (test_and_clear_bit(smp_processor_id(), 5139 &smp_invalidate_needed)) 5140 local_flush_tlb(); 5141 5142 ack_APIC_irq(); 5143 } 5144 5145 static void stop_this_cpu (void) 5146 { 5147 /* Remove this CPU: */ 5148 clear_bit(smp_processor_id(), &cpu_online_map); 5149 5150 if (cpu_data[smp_processor_id()].hlt_works_ok) 5151 for(;;) __asm__("hlt"); 5152 for (;;); 5153 } 5154 5155 /* CPU halt call-back */ 5156 asmlinkage void smp_stop_cpu_interrupt(void) 5157 { 5158 stop_this_cpu(); 5159 } 5160 5161 void (*mtrr_hook) (void) = NULL; 5162 5163 asmlinkage void smp_mtrr_interrupt(void) 5164 { 5165 ack_APIC_irq(); 5166 if (mtrr_hook) (*mtrr_hook)(); 5167 } 5168 5169 /* This interrupt should _never_ happen with our APIC/SMP 5170 * architecture */ 5171 asmlinkage void smp_spurious_interrupt(void) 5172 { 5173 ack_APIC_irq(); 5174 /* see sw-dev-man vol 3, chapter 7.4.13.5 */ 5175 printk("spurious APIC interrupt on CPU#%d, " 5176 "should never happen.\n", smp_processor_id()); 5177 } 5178 5179 /* This part sets up the APIC 32 bit clock in LVTT1, with 5180 * HZ interrupts per second. We assume that the caller 5181 * has already set up the local APIC. 5182 * 5183 * The APIC timer is not exactly sync with the external 5184 * timer chip, it closely follows bus clocks. 
*/ 5185 5186 /* The timer chip is already set up at HZ interrupts per 5187 * second here, but we do not accept timer interrupts 5188 * yet. We only allow the BP to calibrate. */ 5189 static unsigned int __init get_8254_timer_count(void) 5190 { 5191 unsigned int count; 5192 5193 outb_p(0x00, 0x43); 5194 count = inb_p(0x40); 5195 count |= inb_p(0x40) << 8; 5196 5197 return count; 5198 } 5199 5200 /* This function sets up the local APIC timer, with a 5201 * timeout of 'clocks' APIC bus clock. During calibration 5202 * we actually call this function twice, once with a 5203 * bogus timeout value, second time for real. The other 5204 * (noncalibrating) CPUs call this function only once, 5205 * with the real value. 5206 * 5207 * We are strictly in irqs off mode here, as we do not 5208 * want to get an APIC interrupt go off accidentally. 5209 * 5210 * We do reads before writes even if unnecessary, to get 5211 * around the APIC double write bug. */ 5212 #define APIC_DIVISOR 16 5213 5214 void setup_APIC_timer(unsigned int clocks) 5215 { 5216 unsigned long lvtt1_value; 5217 unsigned int tmp_value; 5218 5219 /* Unfortunately the local APIC timer cannot be set up 5220 * into NMI mode. With the IO APIC we can re-route the 5221 * external timer interrupt and broadcast it as an NMI 5222 * to all CPUs, so no pain. 
*/ 5223 tmp_value = apic_read(APIC_LVTT); 5224 lvtt1_value = APIC_LVT_TIMER_PERIODIC | 5225 LOCAL_TIMER_VECTOR; 5226 apic_write(APIC_LVTT , lvtt1_value); 5227 5228 /* Divide PICLK by 16 */ 5229 tmp_value = apic_read(APIC_TDCR); 5230 apic_write(APIC_TDCR , (tmp_value & ~APIC_TDR_DIV_1 ) 5231 | APIC_TDR_DIV_16); 5232 5233 tmp_value = apic_read(APIC_TMICT); 5234 apic_write(APIC_TMICT, clocks/APIC_DIVISOR); 5235 } 5236 5237 void __init wait_8254_wraparound(void) 5238 { 5239 unsigned int curr_count, prev_count=~0; 5240 int delta; 5241 5242 curr_count = get_8254_timer_count(); 5243 5244 do { 5245 prev_count = curr_count; 5246 curr_count = get_8254_timer_count(); 5247 delta = curr_count-prev_count; 5248 5249 /* This limit for delta seems arbitrary, but it 5250 * isn't, it's slightly above the level of error a 5251 * buggy Mercury/Neptune chipset timer can cause. */ 5252 } while (delta<300); 5253 } 5254 5255 /* In this function we calibrate APIC bus clocks to the 5256 * external timer. Unfortunately we cannot use jiffies 5257 * and the timer irq to calibrate, since some later 5258 * bootup code depends on getting the first irq? Ugh. 5259 * 5260 * We want to do the calibration only once since we want 5261 * to have local timer irqs syncron. CPUs connected by 5262 * the same APIC bus have the very same bus frequency. 5263 * And we want to have irqs off anyways, no accidental 5264 * APIC irq that way. */ 5265 5266 int __init calibrate_APIC_clock(void) 5267 { 5268 unsigned long long t1,t2; 5269 long tt1,tt2; 5270 long calibration_result; 5271 int i; 5272 5273 printk("calibrating APIC timer ... "); 5274 5275 /* Put whatever arbitrary (but long enough) timeout 5276 * value into the APIC clock, we just want to get the 5277 * counter running for calibration. */ 5278 setup_APIC_timer(1000000000); 5279 5280 /* The timer chip counts down to zero. 
Let's wait for a 5281 * wraparound to start exact measurement: (the current 5282 * tick might have been already half done) */ 5283 5284 wait_8254_wraparound (); 5285 5286 /* We wrapped around just now. Let's start: */ 5287 READ_TSC(t1); 5288 tt1=apic_read(APIC_TMCCT); 5289 5290 #define LOOPS (HZ/10) 5291 /* Let's wait LOOPS wraprounds: */ 5292 for (i=0; i= 2000, 5402 * new mktime 5403 * 1995-03-26 Markus Kuhn 5404 * fixed 500 ms bug at call to set_rtc_mmss, fixed 5405 * DS12887 precision CMOS clock update 5406 * 1996-05-03 Ingo Molnar 5407 * fixed time warps in 5408 * do_[slow|fast]_gettimeoffset() 5409 * 1997-09-10 Updated NTP code according to technical 5410 * memorandum Jan '96 "A Kernel Model for Precision 5411 * Timekeeping" by Dave Mills 5412 * 1998-09-05 (Various) More robust 5413 * do_fast_gettimeoffset() algorithm implemented 5414 * (works with APM, Cyrix 6x86MX and Centaur C6), 5415 * monotonic gettimeofday() with 5416 * fast_get_timeoffset(), drift-proof precision TSC 5417 * calibration on boot (C. Scott Ananian 5418 * , Andrew D. Balsa 5419 * , Philip Gladstone 5420 * ; ported from 2.0.35 Jumbo-9 5421 * by Michael Krause ). 5422 * 1998-12-16 Andrea Arcangeli 5423 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday 5424 * was missing 1 jiffy because was not accounting 5425 * lost_ticks. 5426 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli 5427 * Fixed a xtime SMP race (we need the xtime_lock rw 5428 * spinlock to serialize accesses to 5429 * xtime/lost_ticks). 
5430 */ 5431 5432 #include 5433 #include 5434 #include 5435 #include 5436 #include 5437 #include 5438 #include 5439 #include 5440 #include 5441 #include 5442 #include 5443 5444 #include 5445 #include 5446 #include 5447 #include 5448 #include 5449 5450 #include 5451 #include 5452 #include 5453 5454 #include 5455 #include 5456 5457 /* for x86_do_profile() */ 5458 #include "irq.h" 5459 5460 5461 /* Detected as we calibrate the TSC */ 5462 unsigned long cpu_hz; 5463 5464 /* Number of usecs that the last interrupt was delayed */ 5465 static int delay_at_last_interrupt; 5466 5467 /* lsb 32 bits of Time Stamp Counter */ 5468 static unsigned long last_tsc_low; 5469 5470 /* Cached *multiplier* to convert TSC counts to 5471 * microseconds. (see the equation below). Equal to 5472 * 2^32 * (1 / (clocks per usec) ). Initialized in 5473 * time_init. */ 5474 static unsigned long fast_gettimeoffset_quotient=0; 5475 5476 extern rwlock_t xtime_lock; 5477 5478 static inline unsigned long do_fast_gettimeoffset(void) 5479 { 5480 register unsigned long eax asm("ax"); 5481 register unsigned long edx asm("dx"); 5482 5483 /* Read the Time Stamp Counter */ 5484 __asm__("rdtsc" 5485 :"=a" (eax), "=d" (edx)); 5486 5487 /* .. relative to previous jiffy (32 bits is enough) */ 5488 eax -= last_tsc_low; /* tsc_low delta */ 5489 5490 /* Time offset 5491 * = (tsc_low delta) * fast_gettimeoffset_quotient 5492 * = (tsc_low delta) * (usecs_per_clock) 5493 * = (tsc_low delta) * (usecs_per_jiffy / 5494 * clocks_per_jiffy) 5495 * Using a mull instead of a divl saves up to 31 clock 5496 * cycles in the critical path. 
*/ 5497 5498 __asm__("mull %2" 5499 :"=a" (eax), "=d" (edx) 5500 :"g" (fast_gettimeoffset_quotient), 5501 "0" (eax)); 5502 5503 /* our adjusted time offset in microseconds */ 5504 return delay_at_last_interrupt + edx; 5505 } 5506 5507 #define TICK_SIZE tick 5508 5509 #ifndef CONFIG_X86_TSC 5510 5511 /* This function must be called with interrupts disabled 5512 * It was inspired by Steve McCanne's microtime-i386 for 5513 * BSD. -- jrs 5514 * 5515 * However, the pc-audio speaker driver changes the 5516 * divisor so that it gets interrupted rather more often 5517 * - it loads 64 into the counter rather than 11932! This 5518 * has an adverse impact on do_gettimeoffset() -- it 5519 * stops working! What is also not good is that the 5520 * interval that our timer function gets called is no 5521 * longer 10.0002 ms, but 9.9767 ms. To get around this 5522 * would require using a different timing source. Maybe 5523 * someone could use the RTC - I know that this can 5524 * interrupt at frequencies ranging from 8192Hz to 5525 * 2Hz. If I had the energy, I'd somehow fix it so that 5526 * at startup, the timer code in sched.c would select 5527 * using either the RTC or the 8253 timer. The decision 5528 * would be based on whether there was any other device 5529 * around that needed to trample on the 8253. I'd set up 5530 * the RTC to interrupt at 1024 Hz, and then do some 5531 * jiggery to have a version of do_timer that advanced 5532 * the clock by 1/1024 s. Every time that reached over 5533 * 1/100 of a second, then do all the old code. If the 5534 * time was kept correct then do_gettimeoffset could just 5535 * return 0 - there is no low order divider that can be 5536 * accessed. 5537 * 5538 * Ideally, you would be able to use the RTC for the 5539 * speaker driver, but it appears that the speaker driver 5540 * really needs interrupt more often than every 120 us or 5541 * so. 5542 * 5543 * Anyway, this needs more thought.... 
pjsg (1993-08-28) 5544 * 5545 * If you are really that interested, you should be 5546 * reading comp.protocols.time.ntp! */ 5547 static unsigned long do_slow_gettimeoffset(void) 5548 { 5549 int count; 5550 5551 /* for the first call after boot */ 5552 static int count_p = LATCH; 5553 static unsigned long jiffies_p = 0; 5554 5555 /* cache volatile jiffies temporarily; we have IRQs 5556 * turned off. */ 5557 unsigned long jiffies_t; 5558 5559 /* timer count may underflow right here */ 5560 outb_p(0x00, 0x43); /* latch the count ASAP */ 5561 5562 count = inb_p(0x40); /* read the latched count */ 5563 5564 /* We do this guaranteed double memory access instead 5565 * of a _p postfix in the previous port access. Wheee, 5566 * hackady hack */ 5567 jiffies_t = jiffies; 5568 5569 count |= inb_p(0x40) << 8; 5570 5571 /* avoiding timer inconsistencies (they are rare, but 5572 * they happen)... there are two kinds of problems 5573 * that must be avoided here: 1. the timer counter 5574 * underflows 2. hardware problem with the timer, not 5575 * giving us continuous time, the counter does small 5576 * "jumps" upwards on some Pentium systems, (see c't 5577 * 95/10 page 335 for Neptune bug.) */ 5578 5579 /* you can safely undefine this if you don't have the 5580 * Neptune chipset */ 5581 5582 #define BUGGY_NEPTUN_TIMER 5583 5584 if( jiffies_t == jiffies_p ) { 5585 if( count > count_p ) { 5586 /* the nutcase */ 5587 5588 outb_p(0x0A, 0x20); 5589 5590 /* assumption about timer being IRQ1 */ 5591 if( inb(0x20) & 0x01 ) { 5592 /* We cannot detect lost timer interrupts ... 5593 * well, that's why we call them lost, don't we? 5594 * :) [hmm, on the Pentium and Alpha we can 5595 * ... sort of] */ 5596 count -= LATCH; 5597 } else { 5598 #ifdef BUGGY_NEPTUN_TIMER 5599 /* for the Neptun bug we know that the 'latch' 5600 * command doesnt latch the high and low value of 5601 * the counter atomically. Thus we have to 5602 * substract 256 from the counter ... funny, 5603 * isn't it? 
:) */ 5604 5605 count -= 256; 5606 #else 5607 printk("do_slow_gettimeoffset(): " 5608 "hardware timer problem?\n"); 5609 #endif 5610 } 5611 } 5612 } else 5613 jiffies_p = jiffies_t; 5614 5615 count_p = count; 5616 5617 count = ((LATCH-1) - count) * TICK_SIZE; 5618 count = (count + LATCH/2) / LATCH; 5619 5620 return count; 5621 } 5622 5623 static unsigned long (*do_gettimeoffset)(void) = 5624 do_slow_gettimeoffset; 5625 5626 #else 5627 5628 #define do_gettimeoffset() do_fast_gettimeoffset() 5629 5630 #endif 5631 5632 /* This version of gettimeofday has microsecond 5633 * resolution and better than microsecond precision on 5634 * fast x86 machines with TSC. */ 5635 void do_gettimeofday(struct timeval *tv) 5636 { 5637 extern volatile unsigned long lost_ticks; 5638 unsigned long flags; 5639 unsigned long usec, sec; 5640 5641 read_lock_irqsave(&xtime_lock, flags); 5642 usec = do_gettimeoffset(); 5643 { 5644 unsigned long lost = lost_ticks; 5645 if (lost) 5646 usec += lost * (1000000 / HZ); 5647 } 5648 sec = xtime.tv_sec; 5649 usec += xtime.tv_usec; 5650 read_unlock_irqrestore(&xtime_lock, flags); 5651 5652 while (usec >= 1000000) { 5653 usec -= 1000000; 5654 sec++; 5655 } 5656 5657 tv->tv_sec = sec; 5658 tv->tv_usec = usec; 5659 } 5660 5661 void do_settimeofday(struct timeval *tv) 5662 { 5663 write_lock_irq(&xtime_lock); 5664 /* This is revolting. We need to set the xtime.tv_usec 5665 * correctly. However, the value in this location is 5666 * is value at the last tick. 5667 * Discover what correction gettimeofday 5668 * would have done, and then undo it! 
5669 */ 5670 tv->tv_usec -= do_gettimeoffset(); 5671 5672 while (tv->tv_usec < 0) { 5673 tv->tv_usec += 1000000; 5674 tv->tv_sec--; 5675 } 5676 5677 xtime = *tv; 5678 time_adjust = 0; /* stop active adjtime() */ 5679 time_status |= STA_UNSYNC; 5680 time_maxerror = NTP_PHASE_LIMIT; 5681 time_esterror = NTP_PHASE_LIMIT; 5682 write_unlock_irq(&xtime_lock); 5683 } 5684 5685 /* In order to set the CMOS clock precisely, set_rtc_mmss 5686 * has to be called 500 ms after the second nowtime has 5687 * started, because when nowtime is written into the 5688 * registers of the CMOS clock, it will jump to the next 5689 * second precisely 500 ms later. Check the Motorola 5690 * MC146818A or Dallas DS12887 data sheet for details. 5691 * 5692 * BUG: This routine does not handle hour overflow 5693 * properly; it just sets the minutes. Usually you'll 5694 * only notice that after reboot! */ 5695 static int set_rtc_mmss(unsigned long nowtime) 5696 { 5697 int retval = 0; 5698 int real_seconds, real_minutes, cmos_minutes; 5699 unsigned char save_control, save_freq_select; 5700 5701 /* tell the clock it's being set */ 5702 save_control = CMOS_READ(RTC_CONTROL); 5703 CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); 5704 5705 /* stop and reset prescaler */ 5706 save_freq_select = CMOS_READ(RTC_FREQ_SELECT); 5707 CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), 5708 RTC_FREQ_SELECT); 5709 5710 cmos_minutes = CMOS_READ(RTC_MINUTES); 5711 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) 5712 BCD_TO_BIN(cmos_minutes); 5713 5714 /* since we're only adjusting minutes and seconds, 5715 * don't interfere with hour overflow. 
This avoids 5716 * messing with unknown time zones but requires your 5717 * RTC not to be off by more than 15 minutes */ 5718 real_seconds = nowtime % 60; 5719 real_minutes = nowtime / 60; 5720 if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1) 5721 real_minutes += 30; /* correct for 1/2-hour tzone */ 5722 real_minutes %= 60; 5723 5724 if (abs(real_minutes - cmos_minutes) < 30) { 5725 if (!(save_control & RTC_DM_BINARY) || 5726 RTC_ALWAYS_BCD) { 5727 BIN_TO_BCD(real_seconds); 5728 BIN_TO_BCD(real_minutes); 5729 } 5730 CMOS_WRITE(real_seconds,RTC_SECONDS); 5731 CMOS_WRITE(real_minutes,RTC_MINUTES); 5732 } else { 5733 printk(KERN_WARNING 5734 "set_rtc_mmss: can't update from %d to %d\n", 5735 cmos_minutes, real_minutes); 5736 retval = -1; 5737 } 5738 5739 /* The following flags have to be released exactly in 5740 * this order, otherwise the DS12887 (popular MC146818A 5741 * clone with integrated battery and quartz) will not 5742 * reset the oscillator and will not update precisely 5743 * 500 ms later. You won't find this mentioned in the 5744 * Dallas Semiconductor data sheets, but who believes 5745 * data sheets anyway ... 
-- Markus Kuhn */ 5746 CMOS_WRITE(save_control, RTC_CONTROL); 5747 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); 5748 5749 return retval; 5750 } 5751 5752 /* last time the cmos clock got updated */ 5753 static long last_rtc_update = 0; 5754 5755 /* timer_interrupt() needs to keep up the real-time 5756 * clock, as well as call the "do_timer()" routine every 5757 * clocktick */ 5758 static inline void do_timer_interrupt(int irq, 5759 void *dev_id, struct pt_regs *regs) 5760 { 5761 #ifdef CONFIG_VISWS 5762 /* Clear the interrupt */ 5763 co_cpu_write(CO_CPU_STAT, 5764 co_cpu_read(CO_CPU_STAT) & ~CO_STAT_TIMEINTR); 5765 #endif 5766 do_timer(regs); 5767 /* In the SMP case we use the local APIC timer interrupt 5768 * to do the profiling, except when we simulate SMP mode 5769 * on a uniprocessor system, in that case we have to call 5770 * the local interrupt handler. */ 5771 #ifndef __SMP__ 5772 if (!user_mode(regs)) 5773 x86_do_profile(regs->eip); 5774 #else 5775 if (!smp_found_config) 5776 smp_local_timer_interrupt(regs); 5777 #endif 5778 5779 /* If we have an externally synchronized Linux clock, 5780 * then update CMOS clock accordingly every ~11 5781 * minutes. Set_rtc_mmss() has to be called as close as 5782 * possible to 500 ms before the new second starts. */ 5783 if ((time_status & STA_UNSYNC) == 0 && 5784 xtime.tv_sec > last_rtc_update + 660 && 5785 xtime.tv_usec >= 500000 - ((unsigned) tick) / 2 && 5786 xtime.tv_usec <= 500000 + ((unsigned) tick) / 2) { 5787 if (set_rtc_mmss(xtime.tv_sec) == 0) 5788 last_rtc_update = xtime.tv_sec; 5789 else /* do it again in 60 s */ 5790 last_rtc_update = xtime.tv_sec - 600; 5791 } 5792 5793 #ifdef CONFIG_MCA 5794 if( MCA_bus ) { 5795 /* The PS/2 uses level-triggered interrupts. You * 5796 * can't turn them off, nor would you want to (any 5797 * attempt to enable edge-triggered interrupts 5798 * usually gets intercepted by a special hardware 5799 * circuit). Hence we have to acknowledge the timer 5800 * interrupt. 
Through some incredibly stupid design 5801 * idea, the reset for IRQ 0 is done by setting the 5802 * high bit of the PPI port B (0x61). Note that some 5803 * PS/2s, notably the 55SX, work fine if this is 5804 * removed. */ 5805 irq = inb_p( 0x61 ); /* read the current state */ 5806 outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ 5807 } 5808 #endif 5809 } 5810 5811 static int use_tsc = 0; 5812 5813 /* This is the same as the above, except we _also_ save 5814 * the current Time Stamp Counter value at the time of 5815 * the timer interrupt, so that we later on can estimate 5816 * the time of day more exactly. */ 5817 static void timer_interrupt(int irq, void *dev_id, 5818 struct pt_regs *regs) 5819 { 5820 int count; 5821 5822 /* Here we are in the timer irq handler. We just have 5823 * irqs locally disabled but we don't know if the 5824 * timer_bh is running on the other CPU. We need to 5825 * avoid to SMP race with it. NOTE: we don' t need the 5826 * irq version of write_lock because as just said we 5827 * have irq locally disabled. -arca */ 5828 write_lock(&xtime_lock); 5829 5830 if (use_tsc) 5831 { 5832 /* It is important that these two operations happen 5833 * almost at the same time. We do the RDTSC stuff 5834 * first, since it's faster. To avoid any 5835 * inconsistencies, we need interrupts disabled 5836 * locally. */ 5837 5838 /* Interrupts are just disabled locally since the 5839 * timer irq has the SA_INTERRUPT flag set. -arca */ 5840 5841 /* read Pentium cycle counter */ 5842 __asm__("rdtsc" : "=a" (last_tsc_low) : : "edx"); 5843 5844 outb_p(0x00, 0x43); /* latch the count ASAP */ 5845 5846 count = inb_p(0x40); /* read the latched count */ 5847 count |= inb(0x40) << 8; 5848 5849 count = ((LATCH-1) - count) * TICK_SIZE; 5850 delay_at_last_interrupt = (count + LATCH/2) / LATCH; 5851 } 5852 5853 do_timer_interrupt(irq, NULL, regs); 5854 5855 write_unlock(&xtime_lock); 5856 5857 } 5858 5859 /* Converts Gregorian date to seconds since 1970-01-01 5860 * 00:00:00. 
Assumes input in normal date format, 5861 * i.e. 1980-12-31 23:59:59 => year=1980, mon=12, day=31, 5862 * hour=23, min=59, sec=59. 5863 * 5864 * [For the Julian calendar (which was used in Russia 5865 * before 1917, Britain & colonies before 1752, anywhere 5866 * else before 1582, and is still in use by some 5867 * communities) leave out the -year/100+year/400 terms, 5868 * and add 10.] 5869 * 5870 * This algorithm was first published by Gauss (I think). 5871 * 5872 * WARNING: this function will overflow on 2106-02-07 5873 * 06:28:16 on machines were long is 32-bit! (However, as 5874 * time_t is signed, we will already get problems at 5875 * other places on 2038-01-19 03:14:08) */ 5876 static inline unsigned long mktime( 5877 unsigned int year, unsigned int mon, 5878 unsigned int day, unsigned int hour, 5879 unsigned int min, unsigned int sec) 5880 { 5881 if (0 >= (int) (mon -= 2)) { /* 1..12 -> 11,12,1..10 */ 5882 mon += 12; /* Puts Feb last since it has leap day */ 5883 year -= 1; 5884 } 5885 return ((( 5886 (unsigned long)(year/4 - year/100 + year/400 + 5887 367*mon/12 + day) + year*365 - 719499 5888 )*24 + hour /* now have hours */ 5889 )*60 + min /* now have minutes */ 5890 )*60 + sec; /* finally seconds */ 5891 } 5892 5893 /* not static: needed by APM */ 5894 unsigned long get_cmos_time(void) 5895 { 5896 unsigned int year, mon, day, hour, min, sec; 5897 int i; 5898 5899 /* The Linux interpretation of the CMOS clock register 5900 * contents: When the Update-In-Progress (UIP) flag 5901 * goes from 1 to 0, the RTC registers show the second 5902 * which has precisely just started. Let's hope other 5903 * operating systems interpret the RTC the same way. */ 5904 /* read RTC exactly on falling edge of update flag */ 5905 /* may take up to 1 second... 
*/ 5906 for (i = 0 ; i < 1000000 ; i++) 5907 if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) 5908 break; 5909 /* must try at least 2.228 ms */ 5910 for (i = 0 ; i < 1000000 ; i++) 5911 if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) 5912 break; 5913 /* Isn't this overkill? 5914 * UIP above should guarantee consistency */ 5915 do { 5916 sec = CMOS_READ(RTC_SECONDS); 5917 min = CMOS_READ(RTC_MINUTES); 5918 hour = CMOS_READ(RTC_HOURS); 5919 day = CMOS_READ(RTC_DAY_OF_MONTH); 5920 mon = CMOS_READ(RTC_MONTH); 5921 year = CMOS_READ(RTC_YEAR); 5922 } while (sec != CMOS_READ(RTC_SECONDS)); 5923 if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || 5924 RTC_ALWAYS_BCD) { 5925 BCD_TO_BIN(sec); 5926 BCD_TO_BIN(min); 5927 BCD_TO_BIN(hour); 5928 BCD_TO_BIN(day); 5929 BCD_TO_BIN(mon); 5930 BCD_TO_BIN(year); 5931 } 5932 if ((year += 1900) < 1970) 5933 year += 100; 5934 return mktime(year, mon, day, hour, min, sec); 5935 } 5936 5937 static struct irqaction irq0 = 5938 { timer_interrupt, SA_INTERRUPT, 0, "timer", NULL, NULL}; 5939 5940 /* ------ Calibrate the TSC ------- 5941 * Return 2^32 * (1 / (TSC clocks per usec)) for 5942 * do_fast_gettimeoffset(). Too much 64-bit arithmetic 5943 * here to do this cleanly in C, and for accuracy's sake 5944 * we want to keep the overhead on the CTC speaker 5945 * (channel 2) output busy loop as low as possible. We 5946 * avoid reading the CTC registers directly because of 5947 * the awkward 8-bit access mechanism of the 82C54 5948 * device. */ 5949 5950 #define CALIBRATE_LATCH (5 * LATCH) 5951 #define CALIBRATE_TIME (5 * 1000020/HZ) 5952 5953 __initfunc(static unsigned long calibrate_tsc(void)) 5954 { 5955 /* Set the Gate high, disable speaker */ 5956 outb((inb(0x61) & ~0x02) | 0x01, 0x61); 5957 5958 /* Now let's take care of CTC channel 2 5959 * 5960 * Set the Gate high, program CTC channel 2 for mode 0, 5961 * (interrupt on terminal count mode), binary count, 5962 * load 5 * LATCH count, (LSB and MSB) to begin 5963 * countdown. 
*/ 5964 outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */ 5965 outb(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */ 5966 outb(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */ 5967 5968 { 5969 unsigned long startlow, starthigh; 5970 unsigned long endlow, endhigh; 5971 unsigned long count; 5972 5973 __asm__ __volatile__("rdtsc":"=a" (startlow),"=d" 5974 (starthigh)); 5975 count = 0; 5976 do { 5977 count++; 5978 } while ((inb(0x61) & 0x20) == 0); 5979 __asm__ __volatile__("rdtsc":"=a" (endlow),"=d" 5980 (endhigh)); 5981 5982 last_tsc_low = endlow; 5983 5984 /* Error: ECTCNEVERSET */ 5985 if (count <= 1) 5986 goto bad_ctc; 5987 5988 /* 64-bit subtract - gcc just messes up with long 5989 * longs */ 5990 __asm__("subl %2,%0\n\t" 5991 "sbbl %3,%1" 5992 :"=a" (endlow), "=d" (endhigh) 5993 :"g" (startlow), "g" (starthigh), 5994 "0" (endlow), "1" (endhigh)); 5995 5996 /* Error: ECPUTOOFAST */ 5997 if (endhigh) 5998 goto bad_ctc; 5999 6000 /* Error: ECPUTOOSLOW */ 6001 if (endlow <= CALIBRATE_TIME) 6002 goto bad_ctc; 6003 6004 __asm__("divl %2" 6005 :"=a" (endlow), "=d" (endhigh) 6006 :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME)); 6007 6008 return endlow; 6009 } 6010 6011 /* The CTC wasn't reliable: we got a hit on the very 6012 * first read, or the CPU was so fast/slow that the 6013 * quotient wouldn't fit in 32 bits.. */ 6014 bad_ctc: 6015 return 0; 6016 } 6017 6018 __initfunc(void time_init(void)) 6019 { 6020 xtime.tv_sec = get_cmos_time(); 6021 xtime.tv_usec = 0; 6022 6023 /* If we have APM enabled or the CPU clock speed is 6024 * variable (CPU stops clock on HLT or slows clock to 6025 * save power) then the TSC timestamps may diverge by up 6026 * to 1 jiffy from 'real time' but nothing will break. 6027 * The most frequent case is that the CPU is "woken" from 6028 * a halt state by the timer interrupt itself, so we get 6029 * 0 error. In the rare cases where a driver would "wake" 6030 * the CPU and request a timestamp, the maximum error is 6031 * < 1 jiffy. 
But timestamps are still perfectly ordered. 6032 * Note that the TSC counter will be reset if APM 6033 * suspends to disk; this won't break the kernel, though, 6034 * 'cuz we're smart. See arch/i386/kernel/apm.c. */ 6035 /* Firstly we have to do a CPU check for chips with a 6036 * potentially buggy TSC. At this point we haven't run 6037 * the ident/bugs checks so we must run this hook as it 6038 * may turn off the TSC flag. 6039 * 6040 * NOTE: this doesn't yet handle SMP 486 machines where 6041 * only some CPU's have a TSC. Thats never worked and 6042 * nobody has moaned if you have the only one in the 6043 * world - you fix it! */ 6044 6045 dodgy_tsc(); 6046 6047 if (boot_cpu_data.x86_capability & X86_FEATURE_TSC) { 6048 unsigned long tsc_quotient = calibrate_tsc(); 6049 if (tsc_quotient) { 6050 fast_gettimeoffset_quotient = tsc_quotient; 6051 use_tsc = 1; 6052 #ifndef do_gettimeoffset 6053 do_gettimeoffset = do_fast_gettimeoffset; 6054 #endif 6055 do_get_fast_time = do_gettimeofday; 6056 6057 /* report CPU clock rate in Hz. The formula is 6058 * (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = 6059 * clock/second. Our precision is about 100 ppm. 
*/ 6060 { unsigned long eax=0, edx=1000000; 6061 __asm__("divl %2" 6062 :"=a" (cpu_hz), "=d" (edx) 6063 :"r" (tsc_quotient), 6064 "0" (eax), "1" (edx)); 6065 printk("Detected %ld Hz processor.\n", cpu_hz); 6066 } 6067 } 6068 } 6069 6070 #ifdef CONFIG_VISWS 6071 printk("Starting Cobalt Timer system clock\n"); 6072 6073 /* Set the countdown value */ 6074 co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ); 6075 6076 /* Start the timer */ 6077 co_cpu_write(CO_CPU_CTRL, 6078 co_cpu_read(CO_CPU_CTRL)|CO_CTRL_TIMERUN); 6079 6080 /* Enable (unmask) the timer interrupt */ 6081 co_cpu_write(CO_CPU_CTRL, 6082 co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); 6083 6084 /* Wire cpu IDT entry to s/w handler (and Cobalt APIC 6085 * to IDT) */ 6086 setup_x86_irq(CO_IRQ_TIMER, &irq0); 6087 #else 6088 setup_x86_irq(0, &irq0); 6089 #endif 6090 } /* FILE: arch/i386/kernel/traps.c */ 6091 /* 6092 * linux/arch/i386/traps.c 6093 * 6094 * Copyright (C) 1991, 1992 Linus Torvalds 6095 */ 6096 6097 /* 'Traps.c' handles hardware traps and faults after we 6098 * have saved some state in 'asm.s'. */ 6099 #include 6100 #include 6101 #include 6102 #include 6103 #include 6104 #include 6105 #include 6106 #include 6107 #include 6108 #include 6109 #include 6110 #include 6111 6112 #ifdef CONFIG_MCA 6113 #include 6114 #include 6115 #endif 6116 6117 #include 6118 #include 6119 #include 6120 #include 6121 #include 6122 #include 6123 #include 6124 6125 #include 6126 6127 #ifdef CONFIG_X86_VISWS_APIC 6128 #include 6129 #include 6130 #include 6131 #endif 6132 6133 #include "irq.h" 6134 6135 asmlinkage int system_call(void); 6136 asmlinkage void lcall7(void); 6137 6138 struct desc_struct default_ldt = { 0, 0 }; 6139 6140 /* The IDT has to be page-aligned to simplify the Pentium 6141 * F0 0F bug workaround.. We have a special link segment 6142 * for this. 
*/ 6143 struct desc_struct idt_table[256] 6144 __attribute__((__section__(".data.idt"))) = { {0, 0}, }; 6145 6146 static inline void console_verbose(void) 6147 { 6148 extern int console_loglevel; 6149 console_loglevel = 15; 6150 } 6151 6152 #define DO_ERROR(trapnr, signr, str, name, tsk) \ 6153 asmlinkage void do_##name(struct pt_regs * regs, \ 6154 long error_code) \ 6155 { \ 6156 tsk->tss.error_code = error_code; \ 6157 tsk->tss.trap_no = trapnr; \ 6158 force_sig(signr, tsk); \ 6159 die_if_no_fixup(str,regs,error_code); \ 6160 } 6161 6162 #define DO_VM86_ERROR(trapnr, signr, str, name, tsk) \ 6163 asmlinkage void do_##name(struct pt_regs * regs, \ 6164 long error_code) \ 6165 { \ 6166 lock_kernel(); \ 6167 if (regs->eflags & VM_MASK) { \ 6168 if (!handle_vm86_trap((struct kernel_vm86_regs *) \ 6169 regs, error_code, trapnr)) \ 6170 goto out; \ 6171 /* else fall through */ \ 6172 } \ 6173 tsk->tss.error_code = error_code; \ 6174 tsk->tss.trap_no = trapnr; \ 6175 force_sig(signr, tsk); \ 6176 die_if_kernel(str,regs,error_code); \ 6177 out: \ 6178 unlock_kernel(); 6179 } 6180 6181 void page_exception(void); 6182 6183 asmlinkage void divide_error(void); 6184 asmlinkage void debug(void); 6185 asmlinkage void nmi(void); 6186 asmlinkage void int3(void); 6187 asmlinkage void overflow(void); 6188 asmlinkage void bounds(void); 6189 asmlinkage void invalid_op(void); 6190 asmlinkage void device_not_available(void); 6191 asmlinkage void double_fault(void); 6192 asmlinkage void coprocessor_segment_overrun(void); 6193 asmlinkage void invalid_TSS(void); 6194 asmlinkage void segment_not_present(void); 6195 asmlinkage void stack_segment(void); 6196 asmlinkage void general_protection(void); 6197 asmlinkage void page_fault(void); 6198 asmlinkage void coprocessor_error(void); 6199 asmlinkage void reserved(void); 6200 asmlinkage void alignment_check(void); 6201 asmlinkage void spurious_interrupt_bug(void); 6202 6203 int kstack_depth_to_print = 24; 6204 6205 /* These constants are for 
searching for possible module 6206 * text segments. VMALLOC_OFFSET comes from 6207 * mm/vmalloc.c; MODULE_RANGE is a guess of how much 6208 * space is likely to be vmalloced. */ 6209 #define VMALLOC_OFFSET (8*1024*1024) 6210 #define MODULE_RANGE (8*1024*1024) 6211 6212 static void show_registers(struct pt_regs *regs) 6213 { 6214 int i; 6215 int in_kernel = 1; 6216 unsigned long esp; 6217 unsigned short ss; 6218 unsigned long *stack, addr, module_start, module_end; 6219 6220 esp = (unsigned long) (1+regs); 6221 ss = __KERNEL_DS; 6222 if (regs->xcs & 3) { 6223 in_kernel = 0; 6224 esp = regs->esp; 6225 ss = regs->xss & 0xffff; 6226 } 6227 printk("CPU: %d\nEIP: %04x:[<%08lx>]" 6228 "\nEFLAGS: %08lx\n", smp_processor_id(), 6229 0xffff & regs->xcs, regs->eip, regs->eflags); 6230 printk("eax: %08lx ebx: %08lx ecx: %08lx " 6231 "edx: %08lx\n", 6232 regs->eax, regs->ebx, regs->ecx, regs->edx); 6233 printk("esi: %08lx edi: %08lx ebp: %08lx " 6234 "esp: %08lx\n", 6235 regs->esi, regs->edi, regs->ebp, esp); 6236 printk("ds: %04x es: %04x ss: %04x\n", 6237 regs->xds & 0xffff, regs->xes & 0xffff, ss); 6238 store_TR(i); 6239 printk("Process %s (pid: %d, process nr: %d, " 6240 "stackpage=%08lx)", current->comm, current->pid, 6241 0xffff & i, 4096+(unsigned long)current); 6242 6243 /* When in-kernel, we also print out the stack and code 6244 * at the time of the fault.. 
*/ 6245 if (in_kernel) { 6246 printk("\nStack: "); 6247 stack = (unsigned long *) esp; 6248 for(i=0; i < kstack_depth_to_print; i++) { 6249 if (((long) stack & 4095) == 0) 6250 break; 6251 if (i && ((i % 8) == 0)) 6252 printk("\n "); 6253 printk("%08lx ", *stack++); 6254 } 6255 printk("\nCall Trace: "); 6256 stack = (unsigned long *) esp; 6257 i = 1; 6258 module_start = PAGE_OFFSET + (max_mapnr<= (unsigned long) &_stext) && 6271 (addr <= (unsigned long) &_etext)) || 6272 ((addr >= module_start) && 6273 (addr <= module_end))) { 6274 if (i && ((i % 8) == 0)) 6275 printk("\n "); 6276 printk("[<%08lx>] ", addr); 6277 i++; 6278 } 6279 } 6280 printk("\nCode: "); 6281 for(i=0;i<20;i++) 6282 printk("%02x ", ((unsigned char *)regs->eip)[i]); 6283 } 6284 printk("\n"); 6285 } 6286 6287 spinlock_t die_lock; 6288 6289 void die(const char * str, struct pt_regs * regs, 6290 long err) 6291 { 6292 console_verbose(); 6293 spin_lock_irq(&die_lock); 6294 printk("%s: %04lx\n", str, err & 0xffff); 6295 show_registers(regs); 6296 spin_unlock_irq(&die_lock); 6297 do_exit(SIGSEGV); 6298 } 6299 6300 static inline void die_if_kernel(const char * str, 6301 struct pt_regs * regs, long err) 6302 { 6303 if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs)) 6304 die(str, regs, err); 6305 } 6306 6307 static void die_if_no_fixup(const char * str, 6308 struct pt_regs * regs, long err) 6309 { 6310 if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs)) 6311 { 6312 unsigned long fixup; 6313 fixup = search_exception_table(regs->eip); 6314 if (fixup) { 6315 regs->eip = fixup; 6316 return; 6317 } 6318 die(str, regs, err); 6319 } 6320 } 6321 6322 DO_VM86_ERROR( 0, SIGFPE, "divide error", divide_error, 6323 current) 6324 DO_VM86_ERROR( 3, SIGTRAP, "int3", int3, current) 6325 DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow, current) 6326 DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds, current) 6327 DO_ERROR( 6, SIGILL, "invalid operand", invalid_op, 6328 current) 6329 DO_VM86_ERROR( 7, SIGSEGV, "device not available", 
6330 device_not_available, current) 6331 DO_ERROR( 8, SIGSEGV, "double fault", double_fault, 6332 current) 6333 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", 6334 coprocessor_segment_overrun, current) 6335 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS, 6336 current) 6337 DO_ERROR(11, SIGBUS, "segment not present", 6338 segment_not_present, current) 6339 DO_ERROR(12, SIGBUS, "stack segment", stack_segment, 6340 current) 6341 DO_ERROR(17, SIGSEGV, "alignment check", alignment_check, 6342 current) 6343 DO_ERROR(18, SIGSEGV, "reserved", reserved, current) 6344 /* I don't have documents for this but it does seem to 6345 * cover the cache flush from user space exception some 6346 * people get. */ 6347 DO_ERROR(19, SIGSEGV, "cache flush denied", 6348 cache_flush_denied, current) 6349 6350 asmlinkage void cache_flush_denied(struct pt_regs * regs, 6351 long error_code) 6352 { 6353 if (regs->eflags & VM_MASK) { 6354 handle_vm86_fault((struct kernel_vm86_regs *) regs, 6355 error_code); 6356 return; 6357 } 6358 die_if_kernel("cache flush denied",regs,error_code); 6359 current->tss.error_code = error_code; 6360 current->tss.trap_no = 19; 6361 force_sig(SIGSEGV, current); 6362 } 6363 6364 asmlinkage void do_general_protection( 6365 struct pt_regs * regs, long error_code) 6366 { 6367 if (regs->eflags & VM_MASK) 6368 goto gp_in_vm86; 6369 6370 if (!(regs->xcs & 3)) 6371 goto gp_in_kernel; 6372 6373 current->tss.error_code = error_code; 6374 current->tss.trap_no = 13; 6375 force_sig(SIGSEGV, current); 6376 return; 6377 6378 gp_in_vm86: 6379 lock_kernel(); 6380 handle_vm86_fault((struct kernel_vm86_regs *) regs, 6381 error_code); 6382 unlock_kernel(); 6383 return; 6384 6385 gp_in_kernel: 6386 { 6387 unsigned long fixup; 6388 fixup = search_exception_table(regs->eip); 6389 if (fixup) { 6390 regs->eip = fixup; 6391 return; 6392 } 6393 die("general protection fault", regs, error_code); 6394 } 6395 } 6396 6397 static void mem_parity_error(unsigned char reason, 6398 struct 
pt_regs * regs) 6399 { 6400 printk("Uhhuh. NMI received. Dazed and confused, " 6401 "but trying to continue\n"); 6402 printk("You probably have a hardware problem with " 6403 "your RAM chips\n"); 6404 } 6405 6406 static void io_check_error(unsigned char reason, 6407 struct pt_regs * regs) 6408 { 6409 unsigned long i; 6410 6411 printk("NMI: IOCK error (debug interrupt?)\n"); 6412 show_registers(regs); 6413 6414 /* Re-enable the IOCK line, wait for a few seconds */ 6415 reason |= 8; 6416 outb(reason, 0x61); 6417 i = 2000; 6418 while (--i) udelay(1000); 6419 reason &= ~8; 6420 outb(reason, 0x61); 6421 } 6422 6423 static void unknown_nmi_error(unsigned char reason, 6424 struct pt_regs * regs) 6425 { 6426 #ifdef CONFIG_MCA 6427 /* Might actually be able to figure out what the guilty 6428 * party is. */ 6429 if( MCA_bus ) { 6430 mca_handle_nmi(); 6431 return; 6432 } 6433 #endif 6434 printk("Uhhuh. NMI received for unknown reason %02x.\n" 6435 , reason); 6436 printk("Dazed and confused, but trying to continue\n"); 6437 printk("Do you have a strange power saving mode " 6438 "enabled?\n"); 6439 } 6440 6441 asmlinkage void do_nmi(struct pt_regs * regs, 6442 long error_code) 6443 { 6444 unsigned char reason = inb(0x61); 6445 extern atomic_t nmi_counter; 6446 6447 atomic_inc(&nmi_counter); 6448 if (reason & 0x80) 6449 mem_parity_error(reason, regs); 6450 if (reason & 0x40) 6451 io_check_error(reason, regs); 6452 if (!(reason & 0xc0)) 6453 unknown_nmi_error(reason, regs); 6454 } 6455 6456 /* Careful - we must not do a lock-kernel until we have 6457 * checked that the debug fault happened in user 6458 * mode. Getting debug exceptions while in the kernel has 6459 * to be handled without locking, to avoid deadlocks.. 
6460 * 6461 * Being careful here means that we don't have to be as 6462 * careful in a lot of more complicated places (task 6463 * switching can be a bit lazy about restoring all the 6464 * debug state, and ptrace doesn't have to find every 6465 * occurrence of the TF bit that could be saved away even 6466 * by user code - and we don't have to be careful about 6467 * what values can be written to the debug registers 6468 * because there are no really bad cases). */ 6469 asmlinkage void do_debug(struct pt_regs * regs, 6470 long error_code) 6471 { 6472 unsigned int condition; 6473 struct task_struct *tsk = current; 6474 6475 if (regs->eflags & VM_MASK) 6476 goto debug_vm86; 6477 6478 __asm__ __volatile__("movl %%db6,%0" : "=r" 6479 (condition)); 6480 6481 /* Mask out spurious TF errors due to lazy TF 6482 * clearing */ 6483 if (condition & DR_STEP) { 6484 /* The TF error should be masked out only if the 6485 * current process is not traced and if the TRAP flag 6486 * has been set previously by a tracing process 6487 * (condition detected by the PF_DTRACE flag); 6488 * remember that the i386 TRAP flag can be modified 6489 * by the process itself in user mode, allowing 6490 * programs to debug themselves without the ptrace() 6491 * interface. 
*/ 6492 if ((tsk->flags & (PF_DTRACE|PF_PTRACED)) == 6493 PF_DTRACE) 6494 goto clear_TF; 6495 } 6496 6497 /* Mask out spurious debug traps due to lazy DR7 6498 * setting */ 6499 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)){ 6500 if (!tsk->tss.debugreg[7]) 6501 goto clear_dr7; 6502 } 6503 6504 /* If this is a kernel mode trap, we need to reset db7 6505 * to allow us to continue sanely */ 6506 if ((regs->xcs & 3) == 0) 6507 goto clear_dr7; 6508 6509 /* Ok, finally something we can handle */ 6510 tsk->tss.trap_no = 1; 6511 tsk->tss.error_code = error_code; 6512 force_sig(SIGTRAP, tsk); 6513 return; 6514 6515 debug_vm86: 6516 lock_kernel(); 6517 handle_vm86_trap((struct kernel_vm86_regs *) regs, 6518 error_code, 1); 6519 unlock_kernel(); 6520 return; 6521 6522 clear_dr7: 6523 __asm__("movl %0,%%db7" 6524 : /* no output */ 6525 : "r" (0)); 6526 return; 6527 6528 clear_TF: 6529 regs->eflags &= ~TF_MASK; 6530 return; 6531 } 6532 6533 /* Note that we play around with the 'TS' bit in an 6534 * attempt to get the correct behaviour even in the 6535 * presence of the asynchronous IRQ13 behaviour */ 6536 void math_error(void) 6537 { 6538 struct task_struct * task; 6539 6540 /* Save the info for the exception handler (this will 6541 * also clear the error) */ 6542 task = current; 6543 save_fpu(task); 6544 task->tss.trap_no = 16; 6545 task->tss.error_code = 0; 6546 force_sig(SIGFPE, task); 6547 } 6548 6549 asmlinkage void do_coprocessor_error( 6550 struct pt_regs * regs, long error_code) 6551 { 6552 ignore_irq13 = 1; 6553 math_error(); 6554 } 6555 6556 asmlinkage void do_spurious_interrupt_bug( 6557 struct pt_regs * regs, long error_code) 6558 { 6559 #if 0 6560 /* No need to warn about this any longer. 
*/ 6561 printk("Ignoring P6 Local APIC Spurious Interrupt " 6562 "Bug...\n"); 6563 #endif 6564 } 6565 6566 /* 'math_state_restore()' saves the current math 6567 * information in the old math state array, and gets the 6568 * new ones from the current task 6569 * 6570 * Careful.. There are problems with IBM-designed IRQ13 6571 * behaviour. Don't touch unless you *really* know how 6572 * it works. */ 6573 asmlinkage void math_state_restore(struct pt_regs regs) 6574 { 6575 /* Allow maths ops (or we recurse) */ 6576 __asm__ __volatile__("clts"); 6577 if(current->used_math) 6578 __asm__("frstor %0": :"m" (current->tss.i387)); 6579 else 6580 { 6581 /* Our first FPU usage, clean the chip. */ 6582 __asm__("fninit"); 6583 current->used_math = 1; 6584 } 6585 /* So we fnsave on switch_to() */ 6586 current->flags|=PF_USEDFPU; 6587 } 6588 6589 #ifndef CONFIG_MATH_EMULATION 6590 6591 asmlinkage void math_emulate(long arg) 6592 { 6593 lock_kernel(); 6594 printk("math-emulation not enabled and no coprocessor " 6595 "found.\n"); 6596 printk("killing %s.\n",current->comm); 6597 force_sig(SIGFPE,current); 6598 schedule(); 6599 unlock_kernel(); 6600 } 6601 6602 #endif /* CONFIG_MATH_EMULATION */ 6603 6604 __initfunc(void trap_init_f00f_bug(void)) 6605 { 6606 unsigned long page; 6607 pgd_t * pgd; 6608 pmd_t * pmd; 6609 pte_t * pte; 6610 6611 /* Allocate a new page in virtual address space, move 6612 * the IDT into it and write protect this page. */ 6613 page = (unsigned long) vmalloc(PAGE_SIZE); 6614 pgd = pgd_offset(&init_mm, page); 6615 pmd = pmd_offset(pgd, page); 6616 pte = pte_offset(pmd, page); 6617 free_page(pte_page(*pte)); 6618 *pte = mk_pte(&idt_table, PAGE_KERNEL_RO); 6619 local_flush_tlb(); 6620 6621 /* "idt" is magic - it overlaps the idt_descr variable 6622 * so that updating idt will automatically update the 6623 * idt descriptor.. 
*/ 6624 idt = (struct desc_struct *)page; 6625 __asm__ __volatile__("lidt %0": "=m" (idt_descr)); 6626 } 6627 6628 #define _set_gate(gate_addr,type,dpl,addr) \ 6629 do { \ 6630 int __d0, __d1; \ 6631 __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ 6632 "movw %4,%%dx\n\t" \ 6633 "movl %%eax,%0\n\t" \ 6634 "movl %%edx,%1" \ 6635 :"=m" (*((long *) (gate_addr))), \ 6636 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), \ 6637 "=&d" (__d1) \ 6638 :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ 6639 "3" ((char *) (addr)),"2" (__KERNEL_CS << 16)); 6640 } while (0) 6641 6642 6643 /* This needs to use 'idt_table' rather than 'idt', and 6644 * thus use the _nonmapped_ version of the IDT, as the 6645 * Pentium F0 0F bugfix can have resulted in the mapped 6646 * IDT being write-protected. */ 6647 void set_intr_gate(unsigned int n, void *addr) 6648 { 6649 _set_gate(idt_table+n,14,0,addr); 6650 } 6651 6652 static void __init set_trap_gate(unsigned int n, 6653 void *addr) 6654 { 6655 _set_gate(idt_table+n,15,0,addr); 6656 } 6657 6658 static void __init set_system_gate(unsigned int n, 6659 void *addr) 6660 { 6661 _set_gate(idt_table+n,15,3,addr); 6662 } 6663 6664 static void __init set_call_gate(void *a, void *addr) 6665 { 6666 _set_gate(a,12,3,addr); 6667 } 6668 6669 #define _set_seg_desc(gate_addr,type,dpl,base,limit) { \ 6670 *((gate_addr)+1) = ((base) & 0xff000000) | \ 6671 (((base) & 0x00ff0000)>>16) | \ 6672 ((limit) & 0xf0000) | \ 6673 ((dpl)<<13) | \ 6674 (0x00408000) | \ 6675 ((type)<<8); \ 6676 *(gate_addr) = (((base) & 0x0000ffff)<<16) | \ 6677 ((limit) & 0x0ffff); } 6678 6679 #define _set_tssldt_desc(n,addr,limit,type) \ 6680 __asm__ __volatile__ ("movw %3,0(%2)\n\t" \ 6681 "movw %%ax,2(%2)\n\t" \ 6682 "rorl $16,%%eax\n\t" \ 6683 "movb %%al,4(%2)\n\t" \ 6684 "movb %4,5(%2)\n\t" \ 6685 "movb $0,6(%2)\n\t" \ 6686 "movb %%ah,7(%2)\n\t" \ 6687 "rorl $16,%%eax" \ 6688 : "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), \ 6689 "i"(type)) 6690 6691 void set_tss_desc(unsigned int n, 
void *addr) 6692 { 6693 _set_tssldt_desc(gdt_table+FIRST_TSS_ENTRY+(n<<1), 6694 (int)addr, 235, 0x89); 6695 } 6696 6697 void set_ldt_desc(unsigned int n, void *addr, 6698 unsigned int size) 6699 { 6700 _set_tssldt_desc(gdt_table+FIRST_LDT_ENTRY+(n<<1), 6701 (int)addr, ((size << 3) - 1), 0x82); 6702 } 6703 6704 #ifdef CONFIG_X86_VISWS_APIC 6705 6706 /* On Rev 005 motherboards legacy device interrupt lines 6707 * are wired directly to Lithium from the 307. But the 6708 * PROM leaves the interrupt type of each 307 logical 6709 * device set appropriate for the 8259. Later we'll 6710 * actually use the 8259, but for now we have to flip the 6711 * interrupt types to level triggered, active lo as 6712 * required by Lithium. */ 6713 #define REG 0x2e /* The register to read/write */ 6714 #define DEV 0x07 /* Register: Logical device select */ 6715 #define VAL 0x2f /* The value to read/write */ 6716 6717 static void 6718 superio_outb(int dev, int reg, int val) 6719 { 6720 outb(DEV, REG); 6721 outb(dev, VAL); 6722 outb(reg, REG); 6723 outb(val, VAL); 6724 } 6725 6726 static int __attribute__ ((unused)) 6727 superio_inb(int dev, int reg) 6728 { 6729 outb(DEV, REG); 6730 outb(dev, VAL); 6731 outb(reg, REG); 6732 return inb(VAL); 6733 } 6734 6735 #define FLOP 3 /* floppy logical device */ 6736 #define PPORT 4 /* parallel logical device */ 6737 #define UART5 5 /* uart2 logical device (not wired up) */ 6738 #define UART6 6 /* uart1 logical device 6739 * (THIS is the serial port!) */ 6740 #define IDEST 0x70 /* int. destination 6741 * (which 307 IRQ line) reg. 
*/ 6742 #define ITYPE 0x71 /* interrupt type register */ 6743 6744 /* interrupt type bits */ 6745 #define LEVEL 0x01 /* bit 0, 0 == edge triggered */ 6746 #define ACTHI 0x02 /* bit 1, 0 == active lo */ 6747 6748 static void 6749 superio_init(void) 6750 { 6751 if (visws_board_type == VISWS_320 && 6752 visws_board_rev == 5) { 6753 /* 0 means no intr propagated */ 6754 superio_outb(UART6, IDEST, 0); 6755 printk("SGI 320 rev 5: " 6756 "disabling 307 uart1 interrupt\n"); 6757 } 6758 } 6759 6760 static void 6761 lithium_init(void) 6762 { 6763 set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS); 6764 printk("Lithium PCI Bridge A, Bus Number: %d\n", 6765 li_pcia_read16(LI_PCI_BUSNUM) & 0xff); 6766 set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS); 6767 printk("Lithium PCI Bridge B (PIIX4), Bus Number: " 6768 "%d\n", li_pcib_read16(LI_PCI_BUSNUM) & 0xff); 6769 6770 /* XXX blindly enables all interrupts */ 6771 li_pcia_write16(LI_PCI_INTEN, 0xffff); 6772 li_pcib_write16(LI_PCI_INTEN, 0xffff); 6773 } 6774 6775 static void 6776 cobalt_init(void) 6777 { 6778 /* On normal SMP PC this is used only with SMP, but we 6779 * have to use it and set it up here to start the 6780 * Cobalt clock */ 6781 set_fixmap(FIX_APIC_BASE, APIC_PHYS_BASE); 6782 printk("Local APIC ID %lx\n", apic_read(APIC_ID)); 6783 printk("Local APIC Version %lx\n", 6784 apic_read(APIC_VERSION)); 6785 6786 set_fixmap(FIX_CO_CPU, CO_CPU_PHYS); 6787 printk("Cobalt Revision %lx\n", 6788 co_cpu_read(CO_CPU_REV)); 6789 6790 set_fixmap(FIX_CO_APIC, CO_APIC_PHYS); 6791 printk("Cobalt APIC ID %lx\n", 6792 co_apic_read(CO_APIC_ID)); 6793 6794 /* Enable Cobalt APIC being careful to NOT change the 6795 * ID! 
*/ 6796 co_apic_write(CO_APIC_ID, 6797 co_apic_read(CO_APIC_ID)|CO_APIC_ENABLE); 6798 6799 printk("Cobalt APIC enabled: ID reg %lx\n", 6800 co_apic_read(CO_APIC_ID)); 6801 } 6802 #endif 6803 void __init trap_init(void) 6804 { 6805 if (readl(0x0FFFD9) == 6806 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24)) 6807 EISA_bus = 1; 6808 set_call_gate(&default_ldt,lcall7); 6809 set_trap_gate(0,÷_error); 6810 set_trap_gate(1,&debug); 6811 set_trap_gate(2,&nmi); 6812 /* int3-5 can be called from all */ 6813 set_system_gate(3,&int3); 6814 set_system_gate(4,&overflow); 6815 set_system_gate(5,&bounds); 6816 set_trap_gate(6,&invalid_op); 6817 set_trap_gate(7,&device_not_available); 6818 set_trap_gate(8,&double_fault); 6819 set_trap_gate(9,&coprocessor_segment_overrun); 6820 set_trap_gate(10,&invalid_TSS); 6821 set_trap_gate(11,&segment_not_present); 6822 set_trap_gate(12,&stack_segment); 6823 set_trap_gate(13,&general_protection); 6824 set_trap_gate(14,&page_fault); 6825 set_trap_gate(15,&spurious_interrupt_bug); 6826 set_trap_gate(16,&coprocessor_error); 6827 set_trap_gate(17,&alignment_check); 6828 set_system_gate(SYSCALL_VECTOR,&system_call); 6829 6830 /* set up GDT task & ldt entries */ 6831 set_tss_desc(0, &init_task.tss); 6832 set_ldt_desc(0, &default_ldt, 1); 6833 6834 /* Clear NT, so that we won't have troubles with that 6835 * later on */ 6836 __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); 6837 load_TR(0); 6838 load_ldt(0); 6839 #ifdef CONFIG_X86_VISWS_APIC 6840 superio_init(); 6841 lithium_init(); 6842 cobalt_init(); 6843 #endif 6844 } /* FILE: arch/i386/lib/delay.c */ 6845 /* 6846 * Precise Delay Loops for i386 6847 * 6848 * Copyright (C) 1993 Linus Torvalds 6849 * Copyright (C) 1997 Martin Mares 6850 * 6851 * 6852 * The __delay function must _NOT_ be inlined as its 6853 * execution time depends wildly on alignment on many x86 6854 * processors. The additional jump magic is needed to get 6855 * the timing stable on all the CPU's we have to worry 6856 * about. 
6857 */ 6858 6859 #include 6860 #include 6861 6862 #ifdef __SMP__ 6863 #include 6864 #endif 6865 6866 void __delay(unsigned long loops) 6867 { 6868 int d0; 6869 __asm__ __volatile__( 6870 "\tjmp 1f\n" 6871 ".align 16\n" 6872 "1:\tjmp 2f\n" 6873 ".align 16\n" 6874 "2:\tdecl %0\n\tjns 2b" 6875 :"=&a" (d0) 6876 :"0" (loops)); 6877 } 6878 6879 inline void __const_udelay(unsigned long xloops) 6880 { 6881 int d0; 6882 __asm__("mull %0" 6883 :"=d" (xloops), "=&a" (d0) 6884 :"1" (xloops),"0" (current_cpu_data.loops_per_sec)); 6885 __delay(xloops); 6886 } 6887 6888 void __udelay(unsigned long usecs) 6889 { 6890 __const_udelay(usecs * 0x000010c6); /* 2**32/1000000 */ 6891 } /* FILE: arch/i386/mm/fault.c */ 6892 /* 6893 * linux/arch/i386/mm/fault.c 6894 * 6895 * Copyright (C) 1995 Linus Torvalds 6896 */ 6897 6898 #include 6899 #include 6900 #include 6901 #include 6902 #include 6903 #include 6904 #include 6905 #include 6906 #include 6907 #include 6908 #include 6909 #include 6910 6911 #include 6912 #include 6913 #include 6914 #include 6915 6916 extern void die(const char *,struct pt_regs *,long); 6917 6918 /* Ugly, ugly, but the goto's result in better assembly.. 
6919 */ 6920 int __verify_write(const void * addr, unsigned long size) 6921 { 6922 struct vm_area_struct * vma; 6923 unsigned long start = (unsigned long) addr; 6924 6925 if (!size) 6926 return 1; 6927 6928 vma = find_vma(current->mm, start); 6929 if (!vma) 6930 goto bad_area; 6931 if (vma->vm_start > start) 6932 goto check_stack; 6933 6934 good_area: 6935 if (!(vma->vm_flags & VM_WRITE)) 6936 goto bad_area; 6937 size--; 6938 size += start & ~PAGE_MASK; 6939 size >>= PAGE_SHIFT; 6940 start &= PAGE_MASK; 6941 6942 for (;;) { 6943 handle_mm_fault(current,vma, start, 1); 6944 if (!size) 6945 break; 6946 size--; 6947 start += PAGE_SIZE; 6948 if (start < vma->vm_end) 6949 continue; 6950 vma = vma->vm_next; 6951 if (!vma || vma->vm_start != start) 6952 goto bad_area; 6953 if (!(vma->vm_flags & VM_WRITE)) 6954 goto bad_area;; 6955 } 6956 return 1; 6957 6958 check_stack: 6959 if (!(vma->vm_flags & VM_GROWSDOWN)) 6960 goto bad_area; 6961 if (expand_stack(vma, start) == 0) 6962 goto good_area; 6963 6964 bad_area: 6965 return 0; 6966 } 6967 6968 asmlinkage void do_invalid_op(struct pt_regs *, 6969 unsigned long); 6970 extern unsigned long idt; 6971 6972 /* This routine handles page faults. It determines the 6973 * address, and the problem, and then passes it off to 6974 * one of the appropriate routines. 6975 * 6976 * error_code: 6977 * bit 0 == 0 means no page found, 1 means prot fault 6978 * bit 1 == 0 means read, 1 means write 6979 * bit 2 == 0 means kernel, 1 means user-mode */ 6980 asmlinkage void do_page_fault(struct pt_regs *regs, 6981 unsigned long error_code) 6982 { 6983 struct task_struct *tsk; 6984 struct mm_struct *mm; 6985 struct vm_area_struct * vma; 6986 unsigned long address; 6987 unsigned long page; 6988 unsigned long fixup; 6989 int write; 6990 6991 /* get the address */ 6992 __asm__("movl %%cr2,%0":"=r" (address)); 6993 6994 tsk = current; 6995 mm = tsk->mm; 6996 6997 /* If we're in an interrupt or have no user context, we 6998 * must not take the fault.. 
*/ 6999 if (in_interrupt() || mm == &init_mm) 7000 goto no_context; 7001 7002 down(&mm->mmap_sem); 7003 7004 vma = find_vma(mm, address); 7005 if (!vma) 7006 goto bad_area; 7007 if (vma->vm_start <= address) 7008 goto good_area; 7009 if (!(vma->vm_flags & VM_GROWSDOWN)) 7010 goto bad_area; 7011 if (error_code & 4) { 7012 /* accessing the stack below %esp is always a bug. 7013 * The "+ 32" is there due to some instructions (like 7014 * pusha) doing post-decrement on the stack and that 7015 * doesn't show up until later.. */ 7016 if (address + 32 < regs->esp) 7017 goto bad_area; 7018 } 7019 if (expand_stack(vma, address)) 7020 goto bad_area; 7021 /* Ok, we have a good vm_area for this memory access, so 7022 * we can handle it.. */ 7023 good_area: 7024 write = 0; 7025 switch (error_code & 3) { 7026 default: /* 3: write, present */ 7027 #ifdef TEST_VERIFY_AREA 7028 if (regs->cs == KERNEL_CS) 7029 printk("WP fault at %08lx\n", regs->eip); 7030 #endif 7031 /* fall through */ 7032 case 2: /* write, not present */ 7033 if (!(vma->vm_flags & VM_WRITE)) 7034 goto bad_area; 7035 write++; 7036 break; 7037 case 1: /* read, present */ 7038 goto bad_area; 7039 case 0: /* read, not present */ 7040 if (!(vma->vm_flags & (VM_READ | VM_EXEC))) 7041 goto bad_area; 7042 } 7043 7044 /* If for any reason at all we couldn't handle the 7045 * fault, make sure we exit gracefully rather than 7046 * endlessly redo the fault. */ 7047 if (!handle_mm_fault(tsk, vma, address, write)) 7048 goto do_sigbus; 7049 7050 /* Did it hit the DOS screen mem VA from vm86 mode? */ 7051 if (regs->eflags & VM_MASK) { 7052 unsigned long bit = (address - 0xA0000) >>PAGE_SHIFT; 7053 if (bit < 32) 7054 tsk->tss.screen_bitmap |= 1 << bit; 7055 } 7056 up(&mm->mmap_sem); 7057 return; 7058 7059 /* Something tried to access memory that isn't in our 7060 * memory map.. Fix it, but check if it's kernel or user 7061 * first.. 
*/ 7062 bad_area: 7063 up(&mm->mmap_sem); 7064 7065 /* User mode accesses just cause a SIGSEGV */ 7066 if (error_code & 4) { 7067 tsk->tss.cr2 = address; 7068 tsk->tss.error_code = error_code; 7069 tsk->tss.trap_no = 14; 7070 force_sig(SIGSEGV, tsk); 7071 return; 7072 } 7073 7074 /* Pentium F0 0F C7 C8 bug workaround. */ 7075 if (boot_cpu_data.f00f_bug) { 7076 unsigned long nr; 7077 7078 nr = (address - idt) >> 3; 7079 7080 if (nr == 6) { 7081 do_invalid_op(regs, 0); 7082 return; 7083 } 7084 } 7085 7086 no_context: 7087 /* Are we prepared to handle this kernel fault? */ 7088 if ((fixup = search_exception_table(regs->eip)) != 0) { 7089 regs->eip = fixup; 7090 return; 7091 } 7092 7093 /* Oops. The kernel tried to access some bad page. We'll 7094 * have to terminate things with extreme prejudice. 7095 * First we check if it was the bootup rw-test, though.. 7096 */ 7097 if (boot_cpu_data.wp_works_ok < 0 && 7098 address == PAGE_OFFSET && (error_code & 1)) { 7099 boot_cpu_data.wp_works_ok = 1; 7100 pg0[0] = pte_val(mk_pte(PAGE_OFFSET, PAGE_KERNEL)); 7101 local_flush_tlb(); 7102 /* Beware: Black magic here. The printk is needed 7103 * here to flush CPU state on certain buggy 7104 * processors. 
*/ 7105 printk("Ok"); 7106 return; 7107 } 7108 7109 if (address < PAGE_SIZE) 7110 printk(KERN_ALERT "Unable to handle kernel " 7111 "NULL pointer dereference"); 7112 else 7113 printk(KERN_ALERT "Unable to handle kernel " 7114 "paging request"); 7115 printk(" at virtual address %08lx\n",address); 7116 __asm__("movl %%cr3,%0" : "=r" (page)); 7117 printk(KERN_ALERT "current->tss.cr3 = %08lx, " 7118 "%%cr3 = %08lx\n", tsk->tss.cr3, page); 7119 page = ((unsigned long *) __va(page))[address >> 22]; 7120 printk(KERN_ALERT "*pde = %08lx\n", page); 7121 if (page & 1) { 7122 page &= PAGE_MASK; 7123 address &= 0x003ff000; 7124 page = ((unsigned long *) 7125 __va(page))[address >> PAGE_SHIFT]; 7126 printk(KERN_ALERT "*pte = %08lx\n", page); 7127 } 7128 die("Oops", regs, error_code); 7129 do_exit(SIGKILL); 7130 7131 /* We ran out of memory, or some other thing happened to 7132 * us that made us unable to handle the page fault 7133 * gracefully. */ 7134 do_sigbus: 7135 up(&mm->mmap_sem); 7136 7137 /* Send a sigbus, regardless of whether we were in 7138 * kernel or user mode. */ 7139 tsk->tss.cr2 = address; 7140 tsk->tss.error_code = error_code; 7141 tsk->tss.trap_no = 14; 7142 force_sig(SIGBUS, tsk); 7143 7144 /* Kernel mode? 
Handle exceptions or die */ 7145 if (!(error_code & 4)) 7146 goto no_context; 7147 } /* FILE: arch/i386/mm/init.c */ 7148 /* 7149 * linux/arch/i386/mm/init.c 7150 * 7151 * Copyright (C) 1995 Linus Torvalds 7152 */ 7153 7154 #include 7155 #include 7156 #include 7157 #include 7158 #include 7159 #include 7160 #include 7161 #include 7162 #include 7163 #include 7164 #include 7165 #include 7166 #include 7167 #ifdef CONFIG_BLK_DEV_INITRD 7168 #include 7169 #endif 7170 7171 #include 7172 #include 7173 #include 7174 #include 7175 #include 7176 #include 7177 7178 extern void show_net_buffers(void); 7179 extern unsigned long init_smp_mappings(unsigned long); 7180 7181 void __bad_pte_kernel(pmd_t *pmd) 7182 { 7183 printk("Bad pmd in pte_alloc: %08lx\n", pmd_val(*pmd)); 7184 pmd_val(*pmd) = _KERNPG_TABLE + __pa(BAD_PAGETABLE); 7185 } 7186 7187 void __bad_pte(pmd_t *pmd) 7188 { 7189 printk("Bad pmd in pte_alloc: %08lx\n", pmd_val(*pmd)); 7190 pmd_val(*pmd) = _PAGE_TABLE + __pa(BAD_PAGETABLE); 7191 } 7192 7193 pte_t *get_pte_kernel_slow(pmd_t *pmd, 7194 unsigned long offset) 7195 { 7196 pte_t *pte; 7197 7198 pte = (pte_t *) __get_free_page(GFP_KERNEL); 7199 if (pmd_none(*pmd)) { 7200 if (pte) { 7201 clear_page((unsigned long)pte); 7202 pmd_val(*pmd) = _KERNPG_TABLE + __pa(pte); 7203 return pte + offset; 7204 } 7205 pmd_val(*pmd) = _KERNPG_TABLE + __pa(BAD_PAGETABLE); 7206 return NULL; 7207 } 7208 free_page((unsigned long)pte); 7209 if (pmd_bad(*pmd)) { 7210 __bad_pte_kernel(pmd); 7211 return NULL; 7212 } 7213 return (pte_t *) pmd_page(*pmd) + offset; 7214 } 7215 7216 pte_t *get_pte_slow(pmd_t *pmd, unsigned long offset) 7217 { 7218 unsigned long pte; 7219 7220 pte = (unsigned long) __get_free_page(GFP_KERNEL); 7221 if (pmd_none(*pmd)) { 7222 if (pte) { 7223 clear_page(pte); 7224 pmd_val(*pmd) = _PAGE_TABLE + __pa(pte); 7225 return (pte_t *)(pte + offset); 7226 } 7227 pmd_val(*pmd) = _PAGE_TABLE + __pa(BAD_PAGETABLE); 7228 return NULL; 7229 } 7230 free_page(pte); 7231 if 
(pmd_bad(*pmd)) { 7232 __bad_pte(pmd); 7233 return NULL; 7234 } 7235 return (pte_t *) (pmd_page(*pmd) + offset); 7236 } 7237 7238 int do_check_pgt_cache(int low, int high) 7239 { 7240 int freed = 0; 7241 if(pgtable_cache_size > high) { 7242 do { 7243 if(pgd_quicklist) 7244 free_pgd_slow(get_pgd_fast()), freed++; 7245 if(pmd_quicklist) 7246 free_pmd_slow(get_pmd_fast()), freed++; 7247 if(pte_quicklist) 7248 free_pte_slow(get_pte_fast()), freed++; 7249 } while(pgtable_cache_size > low); 7250 } 7251 return freed; 7252 } 7253 7254 /* BAD_PAGE is the page that is used for page faults when 7255 * linux is out-of-memory. Older versions of linux just 7256 * did a do_exit(), but using this instead means there is 7257 * less risk for a process dying in kernel mode, possibly 7258 * leaving an inode unused etc.. 7259 * 7260 * BAD_PAGETABLE is the accompanying page-table: it is 7261 * initialized to point to BAD_PAGE entries. 7262 * 7263 * ZERO_PAGE is a special page that is used for 7264 * zero-initialized data and COW. 
*/ 7265 pte_t * __bad_pagetable(void) 7266 { 7267 extern char empty_bad_page_table[PAGE_SIZE]; 7268 int d0, d1; 7269 7270 __asm__ __volatile__("cld ; rep ; stosl" 7271 : "=&D" (d0), "=&c" (d1) 7272 : "a" (pte_val(BAD_PAGE)), 7273 "0" ((long) empty_bad_page_table), 7274 "1" (PAGE_SIZE/4) 7275 : "memory"); 7276 return (pte_t *) empty_bad_page_table; 7277 } 7278 7279 pte_t __bad_page(void) 7280 { 7281 extern char empty_bad_page[PAGE_SIZE]; 7282 int d0, d1; 7283 7284 __asm__ __volatile__("cld ; rep ; stosl" 7285 : "=&D" (d0), "=&c" (d1) 7286 : "a" (0), 7287 "0" ((long) empty_bad_page), 7288 "1" (PAGE_SIZE/4) 7289 : "memory"); 7290 return pte_mkdirty(mk_pte((unsigned long)empty_bad_page 7291 , PAGE_SHARED)); 7292 } 7293 7294 void show_mem(void) 7295 { 7296 int i,free = 0,total = 0,reserved = 0; 7297 int shared = 0, cached = 0; 7298 7299 printk("Mem-info:\n"); 7300 show_free_areas(); 7301 printk("Free swap: %6dkB\n", 7302 nr_swap_pages<<(PAGE_SHIFT-10)); 7303 i = max_mapnr; 7304 while (i-- > 0) { 7305 total++; 7306 if (PageReserved(mem_map+i)) 7307 reserved++; 7308 else if (PageSwapCache(mem_map+i)) 7309 cached++; 7310 else if (!atomic_read(&mem_map[i].count)) 7311 free++; 7312 else 7313 shared += atomic_read(&mem_map[i].count) - 1; 7314 } 7315 printk("%d pages of RAM\n",total); 7316 printk("%d reserved pages\n",reserved); 7317 printk("%d pages shared\n",shared); 7318 printk("%d pages swap cached\n",cached); 7319 printk("%ld pages in page table cache\n", 7320 pgtable_cache_size); 7321 show_buffers(); 7322 #ifdef CONFIG_NET 7323 show_net_buffers(); 7324 #endif 7325 } 7326 7327 extern unsigned long free_area_init(unsigned long, 7328 unsigned long); 7329 7330 /* References to section boundaries */ 7331 7332 extern char _text, _etext, _edata, __bss_start, _end; 7333 extern char __init_begin, __init_end; 7334 7335 #define X86_CR4_VME 0x0001 /* enable vm86 extensions */ 7336 #define X86_CR4_PVI 0x0002 /* virt intrs flag enable */ 7337 #define X86_CR4_TSD 0x0004 /* disable tm 
stamp at ipl 3*/ 7338 #define X86_CR4_DE 0x0008 /* enable debug extensions */ 7339 #define X86_CR4_PSE 0x0010 /* enable pg size extensions*/ 7340 #define X86_CR4_PAE 0x0020 /* enable phys addr extnsns */ 7341 #define X86_CR4_MCE 0x0040 /* Machine check enable */ 7342 #define X86_CR4_PGE 0x0080 /* enable global pages */ 7343 #define X86_CR4_PCE 0x0100 /* enable performance counters 7344 * at ipl 3 */ 7345 7346 /* Save the cr4 feature set we're using (ie Pentium 4MB 7347 * enable and PPro Global page enable), so that any CPU's 7348 * that boot up after us can get the correct flags. */ 7349 unsigned long mmu_cr4_features __initdata = 0; 7350 7351 static inline void set_in_cr4(unsigned long mask) 7352 { 7353 mmu_cr4_features |= mask; 7354 __asm__("movl %%cr4,%%eax\n\t" 7355 "orl %0,%%eax\n\t" 7356 "movl %%eax,%%cr4\n" 7357 : : "irg" (mask) 7358 :"ax"); 7359 } 7360 7361 /* allocate page table(s) for compile-time fixed 7362 * mappings */ 7363 static unsigned long __init fixmap_init( 7364 unsigned long start_mem) 7365 { 7366 pgd_t * pg_dir; 7367 unsigned int idx; 7368 unsigned long address; 7369 7370 start_mem = PAGE_ALIGN(start_mem); 7371 7372 for (idx=1; idx <= __end_of_fixed_addresses; 7373 idx += PTRS_PER_PTE) 7374 { 7375 address =__fix_to_virt(__end_of_fixed_addresses-idx); 7376 pg_dir = swapper_pg_dir + (address >> PGDIR_SHIFT); 7377 memset((void *)start_mem, 0, PAGE_SIZE); 7378 pgd_val(*pg_dir) = _PAGE_TABLE | __pa(start_mem); 7379 start_mem += PAGE_SIZE; 7380 } 7381 7382 return start_mem; 7383 } 7384 7385 static void set_pte_phys (unsigned long vaddr, 7386 unsigned long phys) 7387 { 7388 pgprot_t prot; 7389 pte_t * pte; 7390 7391 pte = pte_offset(pmd_offset(pgd_offset_k(vaddr), vaddr) 7392 , vaddr); 7393 prot = PAGE_KERNEL; 7394 if (boot_cpu_data.x86_capability & X86_FEATURE_PGE) 7395 pgprot_val(prot) |= _PAGE_GLOBAL; 7396 set_pte(pte, mk_pte_phys(phys, prot)); 7397 7398 local_flush_tlb(); 7399 } 7400 7401 void set_fixmap (enum fixed_addresses idx, 7402 unsigned 
long phys) 7403 { 7404 unsigned long address = __fix_to_virt(idx); 7405 7406 if (idx >= __end_of_fixed_addresses) { 7407 printk("Invalid set_fixmap\n"); 7408 return; 7409 } 7410 set_pte_phys (address,phys); 7411 } 7412 7413 /* paging_init() sets up the page tables - note that the 7414 * first 4MB are already mapped by head.S. 7415 * 7416 * This routines also unmaps the page at virtual kernel 7417 * address 0, so that we can trap those pesky 7418 * NULL-reference errors in the kernel. */ 7419 __initfunc(unsigned long paging_init( 7420 unsigned long start_mem, unsigned long end_mem)) 7421 { 7422 pgd_t * pg_dir; 7423 pte_t * pg_table; 7424 unsigned long tmp; 7425 unsigned long address; 7426 7427 /* Physical page 0 is special; it's not touched by Linux 7428 * since BIOS and SMM (for laptops with [34]86/SL chips) 7429 * may need it. It is read and write protected to detect 7430 * null pointer references in the kernel. It may also 7431 * hold the MP configuration table when we are booting 7432 * SMP. */ 7433 start_mem = PAGE_ALIGN(start_mem); 7434 address = PAGE_OFFSET; 7435 pg_dir = swapper_pg_dir; 7436 /* unmap the original low memory mappings */ 7437 pgd_val(pg_dir[0]) = 0; 7438 7439 /* Map whole memory from PAGE_OFFSET */ 7440 pg_dir += USER_PGD_PTRS; 7441 while (address < end_mem) { 7442 /* If we're running on a Pentium CPU, we can use the 7443 * 4MB page tables. 7444 * 7445 * The page tables we create span up to the next 4MB 7446 * virtual memory boundary, but that's OK as we won't 7447 * use that memory anyway. 
*/ 7448 if (boot_cpu_data.x86_capability & X86_FEATURE_PSE) { 7449 unsigned long __pe; 7450 7451 set_in_cr4(X86_CR4_PSE); 7452 boot_cpu_data.wp_works_ok = 1; 7453 __pe = _KERNPG_TABLE + _PAGE_4M + __pa(address); 7454 /* Make it "global" too if supported */ 7455 if(boot_cpu_data.x86_capability & X86_FEATURE_PGE){ 7456 set_in_cr4(X86_CR4_PGE); 7457 __pe += _PAGE_GLOBAL; 7458 } 7459 pgd_val(*pg_dir) = __pe; 7460 pg_dir++; 7461 address += 4*1024*1024; 7462 continue; 7463 } 7464 7465 /* We're on a [34]86, use normal page tables. 7466 * pg_table is physical at this point */ 7467 pg_table = (pte_t *) (PAGE_MASK & pgd_val(*pg_dir)); 7468 if (!pg_table) { 7469 pg_table = (pte_t *) __pa(start_mem); 7470 start_mem += PAGE_SIZE; 7471 } 7472 7473 pgd_val(*pg_dir) = _PAGE_TABLE | 7474 (unsigned long) pg_table; 7475 pg_dir++; 7476 7477 /* now change pg_table to kernel virtual addresses */ 7478 pg_table = (pte_t *) __va(pg_table); 7479 for (tmp = 0; tmp < PTRS_PER_PTE; tmp++,pg_table++) { 7480 pte_t pte = mk_pte(address, PAGE_KERNEL); 7481 if (address >= end_mem) 7482 pte_val(pte) = 0; 7483 set_pte(pg_table, pte); 7484 address += PAGE_SIZE; 7485 } 7486 } 7487 start_mem = fixmap_init(start_mem); 7488 #ifdef __SMP__ 7489 start_mem = init_smp_mappings(start_mem); 7490 #endif 7491 local_flush_tlb(); 7492 7493 return free_area_init(start_mem, end_mem); 7494 } 7495 7496 /* Test if the WP bit works in supervisor mode. It isn't 7497 * supported on 386's and also on some strange 486's 7498 * (NexGen etc.). All 586+'s are OK. The jumps before and 7499 * after the test are here to work-around some nasty CPU 7500 * bugs. */ 7501 __initfunc(void test_wp_bit(void)) 7502 { 7503 unsigned char tmp_reg; 7504 unsigned long old = pg0[0]; 7505 7506 printk("Checking if this processor honours the WP bit " 7507 "even in supervisor mode... 
"); 7508 pg0[0] = pte_val(mk_pte(PAGE_OFFSET, PAGE_READONLY)); 7509 local_flush_tlb(); 7510 current->mm->mmap->vm_start += PAGE_SIZE; 7511 __asm__ __volatile__( 7512 "jmp 1f; 1:\n" 7513 "movb %0,%1\n" 7514 "movb %1,%0\n" 7515 "jmp 1f; 1:\n" 7516 :"=m" (*(char *) __va(0)), 7517 "=q" (tmp_reg) 7518 :/* no inputs */ 7519 :"memory"); 7520 pg0[0] = old; 7521 local_flush_tlb(); 7522 current->mm->mmap->vm_start -= PAGE_SIZE; 7523 if (boot_cpu_data.wp_works_ok < 0) { 7524 boot_cpu_data.wp_works_ok = 0; 7525 printk("No.\n"); 7526 #ifdef CONFIG_X86_WP_WORKS_OK 7527 panic("This kernel doesn't support CPU's with broken" 7528 " WP. Recompile it for a 386!"); 7529 #endif 7530 } else 7531 printk(".\n"); 7532 } 7533 7534 __initfunc(void mem_init(unsigned long start_mem, 7535 unsigned long end_mem)) 7536 { 7537 unsigned long start_low_mem = PAGE_SIZE; 7538 int codepages = 0; 7539 int reservedpages = 0; 7540 int datapages = 0; 7541 int initpages = 0; 7542 unsigned long tmp; 7543 7544 end_mem &= PAGE_MASK; 7545 high_memory = (void *) end_mem; 7546 max_mapnr = num_physpages = MAP_NR(end_mem); 7547 7548 /* clear the zero-page */ 7549 memset(empty_zero_page, 0, PAGE_SIZE); 7550 7551 /* mark usable pages in the mem_map[] */ 7552 start_low_mem = PAGE_ALIGN(start_low_mem)+PAGE_OFFSET; 7553 7554 #ifdef __SMP__ 7555 /* But first pinch a few for the stack/trampoline stuff 7556 * FIXME: Don't need the extra page at 4K, but need to 7557 * fix trampoline before removing it. (see the GDT 7558 * stuff) */ 7559 start_low_mem += PAGE_SIZE; /* 32bit startup code */ 7560 /* AP processor stacks */ 7561 start_low_mem = smp_alloc_memory(start_low_mem); 7562 #endif 7563 start_mem = PAGE_ALIGN(start_mem); 7564 7565 /* IBM messed up *AGAIN* in their thinkpad: 0xA0000 -> 7566 * 0x9F000. They seem to have done something stupid 7567 * with the floppy controller as well.. 
*/ 7568 while (start_low_mem < 0x9f000+PAGE_OFFSET) { 7569 clear_bit(PG_reserved, 7570 &mem_map[MAP_NR(start_low_mem)].flags); 7571 start_low_mem += PAGE_SIZE; 7572 } 7573 7574 while (start_mem < end_mem) { 7575 clear_bit(PG_reserved, 7576 &mem_map[MAP_NR(start_mem)].flags); 7577 start_mem += PAGE_SIZE; 7578 } 7579 for (tmp = PAGE_OFFSET; tmp < end_mem; 7580 tmp += PAGE_SIZE) { 7581 if (tmp >= MAX_DMA_ADDRESS) 7582 clear_bit(PG_DMA, &mem_map[MAP_NR(tmp)].flags); 7583 if (PageReserved(mem_map+MAP_NR(tmp))) { 7584 if (tmp >= (unsigned long) &_text && 7585 tmp < (unsigned long) &_edata) { 7586 if (tmp < (unsigned long) &_etext) 7587 codepages++; 7588 else 7589 datapages++; 7590 } else if (tmp >= (unsigned long) &__init_begin 7591 && tmp < (unsigned long) &__init_end) 7592 initpages++; 7593 else if (tmp >= (unsigned long) &__bss_start 7594 && tmp < (unsigned long) start_mem) 7595 datapages++; 7596 else 7597 reservedpages++; 7598 continue; 7599 } 7600 atomic_set(&mem_map[MAP_NR(tmp)].count, 1); 7601 #ifdef CONFIG_BLK_DEV_INITRD 7602 if (!initrd_start || (tmp < initrd_start || tmp >= 7603 initrd_end)) 7604 #endif 7605 free_page(tmp); 7606 } 7607 printk("Memory: %luk/%luk available (%dk kernel code, " 7608 "%dk reserved, %dk data, %dk init)\n", 7609 (unsigned long) nr_free_pages << (PAGE_SHIFT-10), 7610 max_mapnr << (PAGE_SHIFT-10), 7611 codepages << (PAGE_SHIFT-10), 7612 reservedpages << (PAGE_SHIFT-10), 7613 datapages << (PAGE_SHIFT-10), 7614 initpages << (PAGE_SHIFT-10)); 7615 7616 if (boot_cpu_data.wp_works_ok < 0) 7617 test_wp_bit(); 7618 } 7619 7620 void free_initmem(void) 7621 { 7622 unsigned long addr; 7623 7624 addr = (unsigned long)(&__init_begin); 7625 for (; addr < (unsigned long)(&__init_end); 7626 addr += PAGE_SIZE) { 7627 mem_map[MAP_NR(addr)].flags &= ~(1 << PG_reserved); 7628 atomic_set(&mem_map[MAP_NR(addr)].count, 1); 7629 free_page(addr); 7630 } 7631 printk("Freeing unused kernel memory: %dk freed\n", 7632 (&__init_end - &__init_begin) >> 10); 7633 } 
7634 7635 void si_meminfo(struct sysinfo *val) 7636 { 7637 int i; 7638 7639 i = max_mapnr; 7640 val->totalram = 0; 7641 val->sharedram = 0; 7642 val->freeram = nr_free_pages << PAGE_SHIFT; 7643 val->bufferram = buffermem; 7644 while (i-- > 0) { 7645 if (PageReserved(mem_map+i)) 7646 continue; 7647 val->totalram++; 7648 if (!atomic_read(&mem_map[i].count)) 7649 continue; 7650 val->sharedram += atomic_read(&mem_map[i].count) - 1; 7651 } 7652 val->totalram <<= PAGE_SHIFT; 7653 val->sharedram <<= PAGE_SHIFT; 7654 return; 7655 } /* FILE: fs/binfmt_elf.c */ 7656 /* 7657 * linux/fs/binfmt_elf.c 7658 * 7659 * These are the functions used to load ELF format 7660 * executables as used on SVr4 machines. Information on 7661 * the format may be found in the book "UNIX SYSTEM V 7662 * RELEASE 4 Programmers Guide: Ansi C and Programming 7663 * Support Tools". 7664 * 7665 * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). 7666 */ 7667 7668 #include 7669 7670 #include 7671 #include 7672 #include 7673 #include 7674 #include 7675 #include 7676 #include 7677 #include 7678 #include 7679 #include 7680 #include 7681 #include 7682 #include 7683 #include 7684 #include 7685 #include 7686 #include 7687 #include 7688 7689 #include 7690 #include 7691 7692 #include 7693 7694 #define DLINFO_ITEMS 13 7695 7696 #include 7697 7698 static int load_elf_binary(struct linux_binprm * bprm, 7699 struct pt_regs * regs); 7700 static int load_elf_library(int fd); 7701 extern int dump_fpu (struct pt_regs *, elf_fpregset_t *); 7702 extern void dump_thread(struct pt_regs *, struct user *); 7703 7704 #ifndef elf_addr_t 7705 #define elf_addr_t unsigned long 7706 #define elf_caddr_t char * 7707 #endif 7708 7709 /* If we don't support core dumping, then supply a NULL 7710 * so we don't even try. 
*/ 7711 #ifdef USE_ELF_CORE_DUMP 7712 static int elf_core_dump(long signr, 7713 struct pt_regs * regs); 7714 #else 7715 #define elf_core_dump NULL 7716 #endif 7717 7718 #define ELF_PAGESTART(_v) \ 7719 ((_v) & ~(unsigned long)(ELF_EXEC_PAGESIZE-1)) 7720 #define ELF_PAGEOFFSET(_v) \ 7721 ((_v) & (ELF_EXEC_PAGESIZE-1)) 7722 #define ELF_PAGEALIGN(_v) \ 7723 (((_v) + ELF_EXEC_PAGESIZE - 1) & \ 7724 ~(ELF_EXEC_PAGESIZE - 1)) 7725 7726 static struct linux_binfmt elf_format = { 7727 #ifndef MODULE 7728 NULL, NULL, 7729 load_elf_binary, load_elf_library, elf_core_dump 7730 #else 7731 NULL, &__this_module, 7732 load_elf_binary, load_elf_library, elf_core_dump 7733 #endif 7734 }; 7735 7736 static void set_brk(unsigned long start, 7737 unsigned long end) 7738 { 7739 start = ELF_PAGEALIGN(start); 7740 end = ELF_PAGEALIGN(end); 7741 if (end <= start) 7742 return; 7743 do_mmap(NULL, start, end - start, 7744 PROT_READ | PROT_WRITE | PROT_EXEC, 7745 MAP_FIXED | MAP_PRIVATE, 0); 7746 } 7747 7748 7749 /* We need to explicitly zero any fractional pages 7750 after the data section (i.e. bss). This would 7751 contain the junk from the file that should not 7752 be in memory */ 7753 7754 7755 static void padzero(unsigned long elf_bss) 7756 { 7757 unsigned long nbyte; 7758 7759 nbyte = ELF_PAGEOFFSET(elf_bss); 7760 if (nbyte) { 7761 nbyte = ELF_EXEC_PAGESIZE - nbyte; 7762 clear_user((void *) elf_bss, nbyte); 7763 } 7764 } 7765 7766 static elf_addr_t * 7767 create_elf_tables(char *p, int argc, int envc, 7768 struct elfhdr * exec, 7769 unsigned long load_addr, 7770 unsigned long load_bias, 7771 unsigned long interp_load_addr, int ibcs) 7772 { 7773 elf_caddr_t *argv; 7774 elf_caddr_t *envp; 7775 elf_addr_t *sp, *csp; 7776 char *k_platform, *u_platform; 7777 long hwcap; 7778 size_t platform_len = 0; 7779 7780 /* Get hold of platform and hardware capabilities masks 7781 * for the machine we are running on. 
In some cases 7782 * (Sparc), this info is impossible to get, in others 7783 * (i386) it is merely difficult. */ 7784 hwcap = ELF_HWCAP; 7785 k_platform = ELF_PLATFORM; 7786 7787 if (k_platform) { 7788 platform_len = strlen(k_platform) + 1; 7789 u_platform = p - platform_len; 7790 __copy_to_user(u_platform, k_platform, platform_len); 7791 } else 7792 u_platform = p; 7793 7794 /* Force 16 byte _final_ alignment here for generality. 7795 * Leave an extra 16 bytes free so that on the PowerPC 7796 * we can move the aux table up to start on a 16-byte 7797 * boundary. */ 7798 sp = (elf_addr_t *) 7799 ((~15UL & (unsigned long)(u_platform)) - 16UL); 7800 csp = sp; 7801 csp -= ((exec ? DLINFO_ITEMS*2 : 4) + 7802 (k_platform ? 2 : 0)); 7803 csp -= envc+1; 7804 csp -= argc+1; 7805 csp -= (!ibcs ? 3 : 1); /* argc itself */ 7806 if ((unsigned long)csp & 15UL) 7807 sp -= ((unsigned long)csp & 15UL) / sizeof(*sp); 7808 7809 /* Put the ELF interpreter info on the stack */ 7810 #define NEW_AUX_ENT(nr, id, val) \ 7811 __put_user ((id), sp+(nr*2)); \ 7812 __put_user ((val), sp+(nr*2+1)); \ 7813 7814 sp -= 2; 7815 NEW_AUX_ENT(0, AT_NULL, 0); 7816 if (k_platform) { 7817 sp -= 2; 7818 NEW_AUX_ENT(0, AT_PLATFORM, 7819 (elf_addr_t)(unsigned long) u_platform); 7820 } 7821 sp -= 2; 7822 NEW_AUX_ENT(0, AT_HWCAP, hwcap); 7823 7824 if (exec) { 7825 sp -= 11*2; 7826 7827 NEW_AUX_ENT(0, AT_PHDR, load_addr + exec->e_phoff); 7828 NEW_AUX_ENT(1, AT_PHENT, sizeof (struct elf_phdr)); 7829 NEW_AUX_ENT(2, AT_PHNUM, exec->e_phnum); 7830 NEW_AUX_ENT(3, AT_PAGESZ, ELF_EXEC_PAGESIZE); 7831 NEW_AUX_ENT(4, AT_BASE, interp_load_addr); 7832 NEW_AUX_ENT(5, AT_FLAGS, 0); 7833 NEW_AUX_ENT(6, AT_ENTRY, load_bias + exec->e_entry); 7834 NEW_AUX_ENT(7, AT_UID, (elf_addr_t) current->uid); 7835 NEW_AUX_ENT(8, AT_EUID, (elf_addr_t) current->euid); 7836 NEW_AUX_ENT(9, AT_GID, (elf_addr_t) current->gid); 7837 NEW_AUX_ENT(10, AT_EGID, (elf_addr_t) current->egid); 7838 } 7839 #undef NEW_AUX_ENT 7840 7841 sp -= envc+1; 7842 
envp = (elf_caddr_t *) sp; 7843 sp -= argc+1; 7844 argv = (elf_caddr_t *) sp; 7845 if (!ibcs) { 7846 __put_user((elf_addr_t)(unsigned long) envp,--sp); 7847 __put_user((elf_addr_t)(unsigned long) argv,--sp); 7848 } 7849 7850 __put_user((elf_addr_t)argc,--sp); 7851 current->mm->arg_start = (unsigned long) p; 7852 while (argc-->0) { 7853 __put_user((elf_caddr_t)(unsigned long)p,argv++); 7854 p += strlen_user(p); 7855 } 7856 __put_user(NULL, argv); 7857 current->mm->arg_end = 7858 current->mm->env_start = (unsigned long) p; 7859 while (envc-->0) { 7860 __put_user((elf_caddr_t)(unsigned long)p,envp++); 7861 p += strlen_user(p); 7862 } 7863 __put_user(NULL, envp); 7864 current->mm->env_end = (unsigned long) p; 7865 return sp; 7866 } 7867 7868 7869 /* This is much more generalized than the library routine 7870 read function, so we keep this separate. Technically 7871 the library read function is only provided so that we 7872 can read a.out libraries that have an ELF header */ 7873 7874 static unsigned long load_elf_interp( 7875 struct elfhdr * interp_elf_ex, 7876 struct dentry * interpreter_dentry, 7877 unsigned long *interp_load_addr) 7878 { 7879 struct file * file; 7880 struct elf_phdr *elf_phdata; 7881 struct elf_phdr *eppnt; 7882 unsigned long load_addr = 0; 7883 int load_addr_set = 0; 7884 unsigned long last_bss = 0, elf_bss = 0; 7885 unsigned long error = ~0UL; 7886 int elf_exec_fileno; 7887 int retval, i, size; 7888 7889 /* First of all, some simple consistency checks */ 7890 if (interp_elf_ex->e_type != ET_EXEC && 7891 interp_elf_ex->e_type != ET_DYN) 7892 goto out; 7893 if (!elf_check_arch(interp_elf_ex->e_machine)) 7894 goto out; 7895 if (!interpreter_dentry->d_inode->i_op || 7896 !interpreter_dentry->d_inode->i_op-> 7897 default_file_ops->mmap) 7898 goto out; 7899 7900 /* If the size of this structure has changed, then 7901 * punt, since we will be doing the wrong thing. 
*/ 7902 if (interp_elf_ex->e_phentsize != 7903 sizeof(struct elf_phdr)) 7904 goto out; 7905 7906 /* Now read in all of the header information */ 7907 7908 size = sizeof(struct elf_phdr) *interp_elf_ex->e_phnum; 7909 if (size > ELF_EXEC_PAGESIZE) 7910 goto out; 7911 elf_phdata = 7912 (struct elf_phdr *) kmalloc(size, GFP_KERNEL); 7913 if (!elf_phdata) 7914 goto out; 7915 7916 retval = read_exec(interpreter_dentry, 7917 interp_elf_ex->e_phoff, 7918 (char *) elf_phdata, size, 1); 7919 error = retval; 7920 if (retval < 0) 7921 goto out_free; 7922 7923 error = ~0UL; 7924 elf_exec_fileno = open_dentry(interpreter_dentry, 7925 O_RDONLY); 7926 if (elf_exec_fileno < 0) 7927 goto out_free; 7928 file = fget(elf_exec_fileno); 7929 7930 eppnt = elf_phdata; 7931 for (i=0; i<interp_elf_ex->e_phnum; i++, eppnt++) { 7932 if (eppnt->p_type == PT_LOAD) { 7933 int elf_type = MAP_PRIVATE | MAP_DENYWRITE; 7934 int elf_prot = 0; 7935 unsigned long vaddr = 0; 7936 unsigned long k, map_addr; 7937 7938 if (eppnt->p_flags & PF_R) elf_prot = PROT_READ; 7939 if (eppnt->p_flags & PF_W) elf_prot |= PROT_WRITE; 7940 if (eppnt->p_flags & PF_X) elf_prot |= PROT_EXEC; 7941 vaddr = eppnt->p_vaddr; 7942 if (interp_elf_ex->e_type == ET_EXEC || 7943 load_addr_set) { 7944 elf_type |= MAP_FIXED; 7945 #ifdef __sparc__ 7946 } else { 7947 load_addr = get_unmapped_area(0, eppnt->p_filesz + 7948 ELF_PAGEOFFSET(vaddr)); 7949 #endif 7950 } 7951 7952 map_addr = do_mmap(file, 7953 load_addr + ELF_PAGESTART(vaddr), 7954 eppnt->p_filesz + 7955 ELF_PAGEOFFSET(eppnt->p_vaddr), 7956 elf_prot, 7957 elf_type, 7958 eppnt->p_offset - 7959 ELF_PAGEOFFSET(eppnt->p_vaddr)); 7960 if (map_addr > -1024UL) /* Real error */ 7961 goto out_close; 7962 7963 if (!load_addr_set && 7964 interp_elf_ex->e_type == ET_DYN) { 7965 load_addr = map_addr - ELF_PAGESTART(vaddr); 7966 load_addr_set = 1; 7967 } 7968 7969 /* Find the end of the file mapping for this phdr, 7970 * and keep track of the largest address we see for 7971 * this. 
*/ 7972 k = load_addr + eppnt->p_vaddr + eppnt->p_filesz; 7973 if (k > elf_bss) 7974 elf_bss = k; 7975 7976 /* Do the same thing for the memory mapping - 7977 * between elf_bss and last_bss is the bss section. 7978 */ 7979 k = load_addr + eppnt->p_memsz + eppnt->p_vaddr; 7980 if (k > last_bss) 7981 last_bss = k; 7982 } 7983 } 7984 7985 /* Now use mmap to map the library into memory. */ 7986 7987 /* Now fill out the bss section. First pad the last 7988 * page up to the page boundary, and then perform a 7989 * mmap to make sure that there are zero-mapped pages 7990 * up to and including the last bss page. */ 7991 padzero(elf_bss); 7992 /* What we have mapped so far */ 7993 elf_bss = ELF_PAGESTART(elf_bss + 7994 ELF_EXEC_PAGESIZE - 1); 7995 7996 /* Map the last of the bss segment */ 7997 if (last_bss > elf_bss) 7998 do_mmap(NULL, elf_bss, last_bss - elf_bss, 7999 PROT_READ|PROT_WRITE|PROT_EXEC, 8000 MAP_FIXED|MAP_PRIVATE, 0); 8001 8002 *interp_load_addr = load_addr; 8003 error = 8004 ((unsigned long) interp_elf_ex->e_entry) + load_addr; 8005 8006 out_close: 8007 fput(file); 8008 sys_close(elf_exec_fileno); 8009 out_free: 8010 kfree(elf_phdata); 8011 out: 8012 return error; 8013 } 8014 8015 static unsigned long load_aout_interp( 8016 struct exec * interp_ex, 8017 struct dentry * interpreter_dentry) 8018 { 8019 unsigned long text_data, offset, elf_entry = ~0UL; 8020 char * addr; 8021 int retval; 8022 8023 current->mm->end_code = interp_ex->a_text; 8024 text_data = interp_ex->a_text + interp_ex->a_data; 8025 current->mm->end_data = text_data; 8026 current->mm->brk = interp_ex->a_bss + text_data; 8027 8028 switch (N_MAGIC(*interp_ex)) { 8029 case OMAGIC: 8030 offset = 32; 8031 addr = (char *) 0; 8032 break; 8033 case ZMAGIC: 8034 case QMAGIC: 8035 offset = N_TXTOFF(*interp_ex); 8036 addr = (char *) N_TXTADDR(*interp_ex); 8037 break; 8038 default: 8039 goto out; 8040 } 8041 8042 do_mmap(NULL, 0, text_data, 8043 PROT_READ|PROT_WRITE|PROT_EXEC, 8044 MAP_FIXED|MAP_PRIVATE, 
0); 8045 retval = read_exec(interpreter_dentry, offset, addr, 8046 text_data, 0); 8047 if (retval < 0) 8048 goto out; 8049 flush_icache_range((unsigned long)addr, 8050 (unsigned long)addr + text_data); 8051 8052 do_mmap(NULL, 8053 ELF_PAGESTART(text_data + ELF_EXEC_PAGESIZE-1), 8054 interp_ex->a_bss, 8055 PROT_READ|PROT_WRITE|PROT_EXEC, 8056 MAP_FIXED|MAP_PRIVATE, 0); 8057 elf_entry = interp_ex->a_entry; 8058 8059 out: 8060 return elf_entry; 8061 } 8062 8063 /* These are the functions used to load ELF style 8064 * executables and shared libraries. There is no binary 8065 * dependent code anywhere else. */ 8066 8067 #define INTERPRETER_NONE 0 8068 #define INTERPRETER_AOUT 1 8069 #define INTERPRETER_ELF 2 8070 8071 8072 static inline int 8073 do_load_elf_binary(struct linux_binprm * bprm, 8074 struct pt_regs * regs) 8075 { 8076 struct file * file; 8077 struct dentry *interpreter_dentry = NULL; 8078 unsigned long load_addr = 0, load_bias; 8079 int load_addr_set = 0; 8080 char * elf_interpreter = NULL; 8081 unsigned int interpreter_type = INTERPRETER_NONE; 8082 unsigned char ibcs2_interpreter = 0; 8083 mm_segment_t old_fs; 8084 unsigned long error; 8085 struct elf_phdr * elf_ppnt, *elf_phdata; 8086 unsigned long elf_bss, k, elf_brk; 8087 int elf_exec_fileno; 8088 int retval, size, i; 8089 unsigned long elf_entry, interp_load_addr = 0; 8090 unsigned long start_code, end_code, end_data; 8091 struct elfhdr elf_ex; 8092 struct elfhdr interp_elf_ex; 8093 struct exec interp_ex; 8094 char passed_fileno[6]; 8095 8096 /* Get the exec-header */ 8097 elf_ex = *((struct elfhdr *) bprm->buf); 8098 8099 retval = -ENOEXEC; 8100 /* First of all, some simple consistency checks */ 8101 if (elf_ex.e_ident[0] != 0x7f || 8102 strncmp(&elf_ex.e_ident[1], "ELF", 3) != 0) 8103 goto out; 8104 8105 if (elf_ex.e_type != ET_EXEC && 8106 elf_ex.e_type != ET_DYN) 8107 goto out; 8108 if (!elf_check_arch(elf_ex.e_machine)) 8109 goto out; 8110 #ifdef __mips__ 8111 /* IRIX binaries handled elsewhere. 
*/ 8112 if (elf_ex.e_flags & EF_MIPS_ARCH) { 8113 retval = -ENOEXEC; 8114 goto out; 8115 } 8116 #endif 8117 if (!bprm->dentry->d_inode->i_op || 8118 !bprm->dentry->d_inode->i_op->default_file_ops || 8119 !bprm->dentry->d_inode->i_op->default_file_ops-> 8120 mmap) 8121 goto out; 8122 8123 /* Now read in all of the header information */ 8124 8125 retval = -ENOMEM; 8126 size = elf_ex.e_phentsize * elf_ex.e_phnum; 8127 elf_phdata = 8128 (struct elf_phdr *) kmalloc(size, GFP_KERNEL); 8129 if (!elf_phdata) 8130 goto out; 8131 8132 retval = read_exec(bprm->dentry, elf_ex.e_phoff, 8133 (char *) elf_phdata, size, 1); 8134 if (retval < 0) 8135 goto out_free_ph; 8136 8137 retval = open_dentry(bprm->dentry, O_RDONLY); 8138 if (retval < 0) 8139 goto out_free_ph; 8140 elf_exec_fileno = retval; 8141 file = fget(elf_exec_fileno); 8142 8143 elf_ppnt = elf_phdata; 8144 elf_bss = 0; 8145 elf_brk = 0; 8146 8147 start_code = ~0UL; 8148 end_code = 0; 8149 end_data = 0; 8150 8151 for (i = 0; i < elf_ex.e_phnum; i++) { 8152 if (elf_ppnt->p_type == PT_INTERP) { 8153 retval = -EINVAL; 8154 if (elf_interpreter) 8155 goto out_free_interp; 8156 8157 /* This is the program interpreter used for 8158 * shared libraries - for now assume that this 8159 * is an a.out format binary 8160 */ 8161 8162 retval = -ENOMEM; 8163 elf_interpreter = 8164 (char *) kmalloc(elf_ppnt->p_filesz, GFP_KERNEL); 8165 if (!elf_interpreter) 8166 goto out_free_file; 8167 8168 retval = read_exec(bprm->dentry, 8169 elf_ppnt->p_offset, 8170 elf_interpreter, 8171 elf_ppnt->p_filesz, 1); 8172 if (retval < 0) 8173 goto out_free_interp; 8174 /* If the program interpreter is one of these two, 8175 * then assume an iBCS2 image. Otherwise assume 8176 * a native linux image. 
8177 */ 8178 if (!strcmp(elf_interpreter,"/usr/lib/libc.so.1") 8179 || !strcmp(elf_interpreter,"/usr/lib/ld.so.1")) 8180 ibcs2_interpreter = 1; 8181 #if 0 8182 printk("Using ELF interpreter %s\n", 8183 elf_interpreter); 8184 #endif 8185 old_fs = get_fs(); /* Could probably be optimized*/ 8186 set_fs(get_ds()); 8187 #ifdef __sparc__ 8188 if (ibcs2_interpreter) { 8189 unsigned long old_pers = current->personality; 8190 8191 current->personality = PER_SVR4; 8192 interpreter_dentry = open_namei(elf_interpreter, 8193 0, 0); 8194 current->personality = old_pers; 8195 } else 8196 #endif 8197 interpreter_dentry = open_namei(elf_interpreter, 8198 0, 0); 8199 set_fs(old_fs); 8200 retval = PTR_ERR(interpreter_dentry); 8201 if (IS_ERR(interpreter_dentry)) 8202 goto out_free_interp; 8203 retval = permission(interpreter_dentry->d_inode, 8204 MAY_EXEC); 8205 if (retval < 0) 8206 goto out_free_dentry; 8207 retval = read_exec(interpreter_dentry, 0, 8208 bprm->buf, 128, 1); 8209 if (retval < 0) 8210 goto out_free_dentry; 8211 8212 /* Get the exec headers */ 8213 interp_ex = *((struct exec *) bprm->buf); 8214 interp_elf_ex = *((struct elfhdr *) bprm->buf); 8215 } 8216 elf_ppnt++; 8217 } 8218 8219 /* Some simple consistency checks for the interpreter*/ 8220 if (elf_interpreter) { 8221 interpreter_type = INTERPRETER_ELF|INTERPRETER_AOUT; 8222 8223 /* Now figure out which format our binary is */ 8224 if ((N_MAGIC(interp_ex) != OMAGIC) && 8225 (N_MAGIC(interp_ex) != ZMAGIC) && 8226 (N_MAGIC(interp_ex) != QMAGIC)) 8227 interpreter_type = INTERPRETER_ELF; 8228 8229 if (interp_elf_ex.e_ident[0] != 0x7f || 8230 strncmp(&interp_elf_ex.e_ident[1], "ELF", 3)) 8231 interpreter_type &= ~INTERPRETER_ELF; 8232 8233 retval = -ELIBBAD; 8234 if (!interpreter_type) 8235 goto out_free_dentry; 8236 8237 /* Make sure only one type was selected */ 8238 if ((interpreter_type & INTERPRETER_ELF) && 8239 interpreter_type != INTERPRETER_ELF) { 8240 printk(KERN_WARNING 8241 "ELF: Ambiguous type, using ELF\n"); 
8242 interpreter_type = INTERPRETER_ELF; 8243 } 8244 } 8245 8246 /* OK, we are done with that, now set up the arg stuff, 8247 and then start this sucker up */ 8248 8249 if (!bprm->sh_bang) { 8250 char * passed_p; 8251 8252 if (interpreter_type == INTERPRETER_AOUT) { 8253 sprintf(passed_fileno, "%d", elf_exec_fileno); 8254 passed_p = passed_fileno; 8255 8256 if (elf_interpreter) { 8257 bprm->p = copy_strings(1, &passed_p, bprm->page, 8258 bprm->p, 2); 8259 bprm->argc++; 8260 } 8261 } 8262 retval = -E2BIG; 8263 if (!bprm->p) 8264 goto out_free_dentry; 8265 } 8266 8267 /* Flush all traces of the currently running exe */ 8268 retval = flush_old_exec(bprm); 8269 if (retval) 8270 goto out_free_dentry; 8271 8272 /* OK, This is the point of no return */ 8273 current->mm->end_data = 0; 8274 current->mm->end_code = 0; 8275 current->mm->mmap = NULL; 8276 current->flags &= ~PF_FORKNOEXEC; 8277 elf_entry = (unsigned long) elf_ex.e_entry; 8278 8279 /* Do this immediately, since STACK_TOP as used in 8280 setup_arg_pages may depend on the personality. */ 8281 SET_PERSONALITY(elf_ex, ibcs2_interpreter); 8282 8283 /* Do this so that we can load the interpreter, if need 8284 be. We will change some of these later */ 8285 current->mm->rss = 0; 8286 bprm->p = setup_arg_pages(bprm->p, bprm); 8287 current->mm->start_stack = bprm->p; 8288 8289 /* Try and get dynamic programs out of the way of the 8290 default mmap base, as well as whatever program they 8291 might try to exec. This is because the brk will 8292 follow the loader, and is not movable. */ 8293 8294 load_bias = ELF_PAGESTART(elf_ex.e_type == ET_DYN 8295 ? ELF_ET_DYN_BASE : 0); 8296 8297 /* Now we do a little grungy work by mmaping the ELF 8298 image into the correct location in memory. At this 8299 point, we assume that the image should be loaded at 8300 fixed address, not at a variable address. 
*/ 8301 8302 old_fs = get_fs(); 8303 set_fs(get_ds()); 8304 for(i = 0, elf_ppnt = elf_phdata; i < elf_ex.e_phnum; 8305 i++, elf_ppnt++) { 8306 int elf_prot = 0, elf_flags; 8307 unsigned long vaddr; 8308 8309 if (elf_ppnt->p_type != PT_LOAD) 8310 continue; 8311 8312 if (elf_ppnt->p_flags & PF_R) elf_prot |= PROT_READ; 8313 if (elf_ppnt->p_flags & PF_W) elf_prot |= PROT_WRITE; 8314 if (elf_ppnt->p_flags & PF_X) elf_prot |= PROT_EXEC; 8315 8316 elf_flags = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE; 8317 8318 vaddr = elf_ppnt->p_vaddr; 8319 if (elf_ex.e_type == ET_EXEC || load_addr_set) { 8320 elf_flags |= MAP_FIXED; 8321 } 8322 8323 error = do_mmap(file, 8324 ELF_PAGESTART(load_bias + vaddr), 8325 (elf_ppnt->p_filesz + 8326 ELF_PAGEOFFSET(elf_ppnt->p_vaddr)), 8327 elf_prot, elf_flags, (elf_ppnt->p_offset - 8328 ELF_PAGEOFFSET(elf_ppnt->p_vaddr))); 8329 8330 if (!load_addr_set) { 8331 load_addr_set = 1; 8332 load_addr = 8333 (elf_ppnt->p_vaddr - elf_ppnt->p_offset); 8334 if (elf_ex.e_type == ET_DYN) { 8335 load_bias += error - 8336 ELF_PAGESTART(load_bias + vaddr); 8337 load_addr += error; 8338 } 8339 } 8340 k = elf_ppnt->p_vaddr; 8341 if (k < start_code) start_code = k; 8342 k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz; 8343 if (k > elf_bss) 8344 elf_bss = k; 8345 if ((elf_ppnt->p_flags & PF_X) && end_code < k) 8346 end_code = k; 8347 if (end_data < k) 8348 end_data = k; 8349 k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; 8350 if (k > elf_brk) 8351 elf_brk = k; 8352 } 8353 set_fs(old_fs); 8354 fput(file); /* all done with the file */ 8355 8356 elf_entry += load_bias; 8357 elf_bss += load_bias; 8358 elf_brk += load_bias; 8359 start_code += load_bias; 8360 end_code += load_bias; 8361 end_data += load_bias; 8362 8363 if (elf_interpreter) { 8364 if (interpreter_type == INTERPRETER_AOUT) 8365 elf_entry = load_aout_interp(&interp_ex, 8366 interpreter_dentry); 8367 else 8368 elf_entry = load_elf_interp(&interp_elf_ex, 8369 interpreter_dentry, 8370 &interp_load_addr); 8371 8372 
dput(interpreter_dentry); 8373 kfree(elf_interpreter); 8374 8375 if (elf_entry == ~0UL) { 8376 printk(KERN_ERR "Unable to load interpreter\n"); 8377 kfree(elf_phdata); 8378 send_sig(SIGSEGV, current, 0); 8379 return 0; 8380 } 8381 } 8382 8383 kfree(elf_phdata); 8384 8385 if (interpreter_type != INTERPRETER_AOUT) 8386 sys_close(elf_exec_fileno); 8387 8388 if (current->exec_domain && 8389 current->exec_domain->module) 8390 __MOD_DEC_USE_COUNT(current->exec_domain->module); 8391 if (current->binfmt && current->binfmt->module) 8392 __MOD_DEC_USE_COUNT(current->binfmt->module); 8393 current->exec_domain = 8394 lookup_exec_domain(current->personality); 8395 current->binfmt = &elf_format; 8396 if (current->exec_domain && 8397 current->exec_domain->module) 8398 __MOD_INC_USE_COUNT(current->exec_domain->module); 8399 if (current->binfmt && current->binfmt->module) 8400 __MOD_INC_USE_COUNT(current->binfmt->module); 8401 8402 #ifndef VM_STACK_FLAGS 8403 current->executable = dget(bprm->dentry); 8404 #endif 8405 compute_creds(bprm); 8406 current->flags &= ~PF_FORKNOEXEC; 8407 bprm->p = (unsigned long) 8408 create_elf_tables((char *)bprm->p, 8409 bprm->argc, 8410 bprm->envc, 8411 (interpreter_type == INTERPRETER_ELF 8412 ? &elf_ex : NULL), 8413 load_addr, load_bias, 8414 interp_load_addr, 8415 (interpreter_type == INTERPRETER_AOUT ? 0 : 1)); 8416 /* N.B. passed_fileno might not be initialized? 
*/ 8417 if (interpreter_type == INTERPRETER_AOUT) 8418 current->mm->arg_start += strlen(passed_fileno) + 1; 8419 current->mm->start_brk = current->mm->brk = elf_brk; 8420 current->mm->end_code = end_code; 8421 current->mm->start_code = start_code; 8422 current->mm->end_data = end_data; 8423 current->mm->start_stack = bprm->p; 8424 8425 /* Calling set_brk effectively mmaps the pages that we 8426 * need for the bss and break sections */ 8427 set_brk(elf_bss, elf_brk); 8428 8429 padzero(elf_bss); 8430 8431 #if 0 8432 printk("(start_brk) %x\n" , current->mm->start_brk); 8433 printk("(end_code) %x\n" , current->mm->end_code); 8434 printk("(start_code) %x\n" , current->mm->start_code); 8435 printk("(end_data) %x\n" , current->mm->end_data); 8436 printk("(start_stack) %x\n", current->mm->start_stack); 8437 printk("(brk) %x\n" , current->mm->brk); 8438 #endif 8439 8440 if ( current->personality == PER_SVR4 ) 8441 { 8442 /* Why this, you ask??? Well SVr4 maps page 0 as 8443 read-only, and some applications "depend" upon 8444 this behavior. Since we do not have the power to 8445 recompile these, we emulate the SVr4 behavior. 8446 Sigh. */ 8447 /* N.B. Shouldn't the size here be PAGE_SIZE?? */ 8448 error = do_mmap(NULL, 0, 4096, PROT_READ | PROT_EXEC, 8449 MAP_FIXED | MAP_PRIVATE, 0); 8450 } 8451 8452 #ifdef ELF_PLAT_INIT 8453 /* The ABI may specify that certain registers be set up 8454 * in special ways (on i386 %edx is the address of a 8455 * DT_FINI function, for example. This macro performs 8456 * whatever initialization to the regs structure is 8457 * required. 
*/ 8458 ELF_PLAT_INIT(regs); 8459 #endif 8460 8461 start_thread(regs, elf_entry, bprm->p); 8462 if (current->flags & PF_PTRACED) 8463 send_sig(SIGTRAP, current, 0); 8464 retval = 0; 8465 out: 8466 return retval; 8467 8468 /* error cleanup */ 8469 out_free_dentry: 8470 dput(interpreter_dentry); 8471 out_free_interp: 8472 if (elf_interpreter) 8473 kfree(elf_interpreter); 8474 out_free_file: 8475 fput(file); 8476 sys_close(elf_exec_fileno); 8477 out_free_ph: 8478 kfree(elf_phdata); 8479 goto out; 8480 } 8481 8482 static int 8483 load_elf_binary(struct linux_binprm * bprm, 8484 struct pt_regs * regs) 8485 { 8486 int retval; 8487 8488 MOD_INC_USE_COUNT; 8489 retval = do_load_elf_binary(bprm, regs); 8490 MOD_DEC_USE_COUNT; 8491 return retval; 8492 } 8493 8494 /* This is really simpleminded and specialized - we are 8495 loading an a.out library that is given an ELF 8496 header. */ 8497 8498 static inline int 8499 do_load_elf_library(int fd) 8500 { 8501 struct file * file; 8502 struct dentry * dentry; 8503 struct inode * inode; 8504 struct elf_phdr *elf_phdata; 8505 unsigned long elf_bss = 0, bss, len, k; 8506 int retval, error, i, j; 8507 struct elfhdr elf_ex; 8508 loff_t offset = 0; 8509 8510 error = -EACCES; 8511 file = fget(fd); 8512 if (!file || !file->f_op) 8513 goto out; 8514 dentry = file->f_dentry; 8515 inode = dentry->d_inode; 8516 8517 /* seek to the beginning of the file */ 8518 error = -ENOEXEC; 8519 8520 /* N.B. save current DS?? 
*/ 8521 set_fs(KERNEL_DS); 8522 retval = file->f_op->read(file, (char *) &elf_ex, 8523 sizeof(elf_ex), &offset); 8524 set_fs(USER_DS); 8525 if (retval != sizeof(elf_ex)) 8526 goto out_putf; 8527 8528 if (elf_ex.e_ident[0] != 0x7f || 8529 strncmp(&elf_ex.e_ident[1], "ELF", 3) != 0) 8530 goto out_putf; 8531 8532 /* First of all, some simple consistency checks */ 8533 if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 || 8534 !elf_check_arch(elf_ex.e_machine) || 8535 (!inode->i_op || 8536 !inode->i_op->default_file_ops->mmap)) 8537 goto out_putf; 8538 8539 /* Now read in all of the header information */ 8540 8541 j = sizeof(struct elf_phdr) * elf_ex.e_phnum; 8542 if (j > ELF_EXEC_PAGESIZE) 8543 goto out_putf; 8544 8545 error = -ENOMEM; 8546 elf_phdata = (struct elf_phdr *) kmalloc(j,GFP_KERNEL); 8547 if (!elf_phdata) 8548 goto out_putf; 8549 8550 /* N.B. check for error return?? */ 8551 retval = read_exec(dentry, elf_ex.e_phoff, 8552 (char *) elf_phdata, 8553 sizeof(struct elf_phdr) * elf_ex.e_phnum, 1); 8554 8555 error = -ENOEXEC; 8556 for (j = 0, i = 0; ip_type == PT_LOAD) j++; 8558 if (j != 1) 8559 goto out_free_ph; 8560 8561 while (elf_phdata->p_type != PT_LOAD) elf_phdata++; 8562 8563 /* Now use mmap to map the library into memory. 
*/ 8564 error = do_mmap(file, 8565 ELF_PAGESTART(elf_phdata->p_vaddr), 8566 (elf_phdata->p_filesz + 8567 ELF_PAGEOFFSET(elf_phdata->p_vaddr)), 8568 PROT_READ | PROT_WRITE | PROT_EXEC, 8569 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, 8570 (elf_phdata->p_offset - 8571 ELF_PAGEOFFSET(elf_phdata->p_vaddr))); 8572 if (error != ELF_PAGESTART(elf_phdata->p_vaddr)) 8573 goto out_free_ph; 8574 8575 k = elf_phdata->p_vaddr + elf_phdata->p_filesz; 8576 if (k > elf_bss) 8577 elf_bss = k; 8578 padzero(elf_bss); 8579 8580 len = ELF_PAGESTART(elf_phdata->p_filesz + 8581 elf_phdata->p_vaddr + 8582 ELF_EXEC_PAGESIZE - 1); 8583 bss = elf_phdata->p_memsz + elf_phdata->p_vaddr; 8584 if (bss > len) 8585 do_mmap(NULL, len, bss - len, 8586 PROT_READ|PROT_WRITE|PROT_EXEC, 8587 MAP_FIXED|MAP_PRIVATE, 0); 8588 error = 0; 8589 8590 out_free_ph: 8591 kfree(elf_phdata); 8592 out_putf: 8593 fput(file); 8594 out: 8595 return error; 8596 } 8597 8598 static int load_elf_library(int fd) 8599 { 8600 int retval; 8601 8602 MOD_INC_USE_COUNT; 8603 retval = do_load_elf_library(fd); 8604 MOD_DEC_USE_COUNT; 8605 return retval; 8606 } 8607 8608 /* Note that some platforms still use traditional core 8609 * dumps and not the ELF core dump. Each platform can 8610 * select it as appropriate. */ 8611 #ifdef USE_ELF_CORE_DUMP 8612 8613 /* ELF core dumper 8614 * 8615 * Modelled on fs/exec.c:aout_core_dump() 8616 * Jeremy Fitzhardinge 8617 */ 8618 /* These are the only things you should do on a 8619 * core-file: use only these functions to write out all 8620 * the necessary info. 
*/ 8621 static int dump_write(struct file *file, 8622 const void *addr, int nr) 8623 { 8624 return file->f_op->write(file, addr, nr, &file->f_pos) 8625 == nr; 8626 } 8627 8628 static int dump_seek(struct file *file, off_t off) 8629 { 8630 if (file->f_op->llseek) { 8631 if (file->f_op->llseek(file, off, 0) != off) 8632 return 0; 8633 } else 8634 file->f_pos = off; 8635 return 1; 8636 } 8637 8638 /* Decide whether a segment is worth dumping; default is 8639 * yes to be sure (missing info is worse than too much; 8640 * etc). Personally I'd include everything, and use the 8641 * coredump limit... 8642 * 8643 * I think we should skip something. But I am not sure 8644 * how. H.J. */ 8645 static inline int maydump(struct vm_area_struct *vma) 8646 { 8647 if (!(vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC))) 8648 return 0; 8649 8650 /* Do not dump I/O mapped devices! -DaveM */ 8651 if(vma->vm_flags & VM_IO) 8652 return 0; 8653 #if 1 8654 if (vma->vm_flags & (VM_WRITE|VM_GROWSUP|VM_GROWSDOWN)) 8655 return 1; 8656 if (vma->vm_flags & (VM_READ|VM_EXEC|VM_EXECUTABLE| 8657 VM_SHARED)) 8658 return 0; 8659 #endif 8660 return 1; 8661 } 8662 8663 #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) 8664 8665 /* An ELF note in memory */ 8666 struct memelfnote 8667 { 8668 const char *name; 8669 int type; 8670 unsigned int datasz; 8671 void *data; 8672 }; 8673 8674 static int notesize(struct memelfnote *en) 8675 { 8676 int sz; 8677 8678 sz = sizeof(struct elf_note); 8679 sz += roundup(strlen(en->name), 4); 8680 sz += roundup(en->datasz, 4); 8681 8682 return sz; 8683 } 8684 8685 /* #define DEBUG */ 8686 8687 #ifdef DEBUG 8688 static void dump_regs(const char *str, elf_greg_t *r) 8689 { 8690 int i; 8691 static const char *regs[] = { 8692 "ebx", "ecx", "edx", "esi", "edi", "ebp", 8693 "eax", "ds", "es", "fs", "gs", 8694 "orig_eax", "eip", "cs", 8695 "efl", "uesp", "ss"}; 8696 printk("Registers: %s\n", str); 8697 8698 for(i = 0; i < ELF_NGREG; i++) 8699 { 8700 unsigned long val = r[i]; 8701 
printk(" %-2d %-5s=%08lx %lu\n", 8702 i, regs[i], val, val); 8703 } 8704 } 8705 #endif 8706 8707 #define DUMP_WRITE(addr, nr) \ 8708 do { if (!dump_write(file, (addr), (nr))) return 0; } \ 8709 while(0) 8710 #define DUMP_SEEK(off) \ 8711 do { if (!dump_seek(file, (off))) return 0; } while(0) 8712 8713 static int writenote(struct memelfnote *men, 8714 struct file *file) 8715 { 8716 struct elf_note en; 8717 8718 en.n_namesz = strlen(men->name); 8719 en.n_descsz = men->datasz; 8720 en.n_type = men->type; 8721 8722 DUMP_WRITE(&en, sizeof(en)); 8723 DUMP_WRITE(men->name, en.n_namesz); 8724 /* XXX - cast from long long to long to avoid need for 8725 * libgcc.a */ 8726 /* XXX */ 8727 DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); 8728 DUMP_WRITE(men->data, men->datasz); 8729 /* XXX */ 8730 DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); 8731 8732 return 1; 8733 } 8734 #undef DUMP_WRITE 8735 #undef DUMP_SEEK 8736 8737 #define DUMP_WRITE(addr, nr) \ 8738 if (!dump_write(&file, (addr), (nr))) \ 8739 goto close_coredump; 8740 #define DUMP_SEEK(off) \ 8741 if (!dump_seek(&file, (off))) \ 8742 goto close_coredump; 8743 /* Actual dumper 8744 * 8745 * This is a two-pass process; first we find the offsets 8746 * of the bits, and then they are actually written out. 8747 * If we run out of core limit we just truncate. 
*/ 8748 static int elf_core_dump(long signr, 8749 struct pt_regs * regs) 8750 { 8751 int has_dumped = 0; 8752 struct file file; 8753 struct dentry *dentry; 8754 struct inode *inode; 8755 mm_segment_t fs; 8756 char corefile[6+sizeof(current->comm)]; 8757 int segs; 8758 int i; 8759 size_t size; 8760 struct vm_area_struct *vma; 8761 struct elfhdr elf; 8762 off_t offset = 0, dataoff; 8763 unsigned long limit = 8764 current->rlim[RLIMIT_CORE].rlim_cur; 8765 int numnote = 4; 8766 struct memelfnote notes[4]; 8767 struct elf_prstatus prstatus; /* NT_PRSTATUS */ 8768 elf_fpregset_t fpu; /* NT_PRFPREG */ 8769 struct elf_prpsinfo psinfo; /* NT_PRPSINFO */ 8770 8771 if (!current->dumpable || 8772 limit < ELF_EXEC_PAGESIZE || 8773 atomic_read(¤t->mm->count) != 1) 8774 return 0; 8775 current->dumpable = 0; 8776 8777 #ifndef CONFIG_BINFMT_ELF 8778 MOD_INC_USE_COUNT; 8779 #endif 8780 8781 /* Count what's needed to dump, up to the limit of 8782 * coredump size */ 8783 segs = 0; 8784 size = 0; 8785 for(vma = current->mm->mmap; vma != NULL; 8786 vma = vma->vm_next) { 8787 if (maydump(vma)) 8788 { 8789 unsigned long sz = vma->vm_end-vma->vm_start; 8790 8791 if (size+sz >= limit) 8792 break; 8793 else 8794 size += sz; 8795 } 8796 8797 segs++; 8798 } 8799 #ifdef DEBUG 8800 printk("elf_core_dump: %d segs taking %d bytes\n", 8801 segs, size); 8802 #endif 8803 8804 /* Set up header */ 8805 memcpy(elf.e_ident, ELFMAG, SELFMAG); 8806 elf.e_ident[EI_CLASS] = ELF_CLASS; 8807 elf.e_ident[EI_DATA] = ELF_DATA; 8808 elf.e_ident[EI_VERSION] = EV_CURRENT; 8809 memset(elf.e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); 8810 8811 elf.e_type = ET_CORE; 8812 elf.e_machine = ELF_ARCH; 8813 elf.e_version = EV_CURRENT; 8814 elf.e_entry = 0; 8815 elf.e_phoff = sizeof(elf); 8816 elf.e_shoff = 0; 8817 elf.e_flags = 0; 8818 elf.e_ehsize = sizeof(elf); 8819 elf.e_phentsize = sizeof(struct elf_phdr); 8820 elf.e_phnum = segs+1; /* Include notes */ 8821 elf.e_shentsize = 0; 8822 elf.e_shnum = 0; 8823 elf.e_shstrndx = 0; 
8824 8825 fs = get_fs(); 8826 set_fs(KERNEL_DS); 8827 memcpy(corefile,"core.",5); 8828 #if 0 8829 memcpy(corefile+5,current->comm,sizeof(current->comm)); 8830 #else 8831 corefile[4] = '\0'; 8832 #endif 8833 dentry = open_namei(corefile, 8834 O_CREAT | 2 | O_TRUNC | O_NOFOLLOW, 0600); 8835 if (IS_ERR(dentry)) { 8836 dentry = NULL; 8837 goto end_coredump; 8838 } 8839 inode = dentry->d_inode; 8840 8841 if(inode->i_nlink > 1) 8842 goto end_coredump; /* multiple links - don't dump */ 8843 8844 if (!S_ISREG(inode->i_mode)) 8845 goto end_coredump; 8846 if (!inode->i_op || !inode->i_op->default_file_ops) 8847 goto end_coredump; 8848 if (init_private_file(&file, dentry, 3)) 8849 goto end_coredump; 8850 if (!file.f_op->write) 8851 goto close_coredump; 8852 has_dumped = 1; 8853 current->flags |= PF_DUMPCORE; 8854 8855 DUMP_WRITE(&elf, sizeof(elf)); 8856 offset += sizeof(elf); /* Elf header */ 8857 /* Program headers */ 8858 offset += (segs+1) * sizeof(struct elf_phdr); 8859 8860 /* Set up the notes in similar form to SVR4 core dumps 8861 * made with info from their /proc. 
*/ 8862 memset(&psinfo, 0, sizeof(psinfo)); 8863 memset(&prstatus, 0, sizeof(prstatus)); 8864 8865 notes[0].name = "CORE"; 8866 notes[0].type = NT_PRSTATUS; 8867 notes[0].datasz = sizeof(prstatus); 8868 notes[0].data = &prstatus; 8869 prstatus.pr_info.si_signo = prstatus.pr_cursig = signr; 8870 prstatus.pr_sigpend = current->signal.sig[0]; 8871 prstatus.pr_sighold = current->blocked.sig[0]; 8872 psinfo.pr_pid = prstatus.pr_pid = current->pid; 8873 psinfo.pr_ppid = prstatus.pr_ppid = 8874 current->p_pptr->pid; 8875 psinfo.pr_pgrp = prstatus.pr_pgrp = current->pgrp; 8876 psinfo.pr_sid = prstatus.pr_sid = current->session; 8877 prstatus.pr_utime.tv_sec = 8878 CT_TO_SECS(current->times.tms_utime); 8879 prstatus.pr_utime.tv_usec = 8880 CT_TO_USECS(current->times.tms_utime); 8881 prstatus.pr_stime.tv_sec = 8882 CT_TO_SECS(current->times.tms_stime); 8883 prstatus.pr_stime.tv_usec = 8884 CT_TO_USECS(current->times.tms_stime); 8885 prstatus.pr_cutime.tv_sec = 8886 CT_TO_SECS(current->times.tms_cutime); 8887 prstatus.pr_cutime.tv_usec = 8888 CT_TO_USECS(current->times.tms_cutime); 8889 prstatus.pr_cstime.tv_sec = 8890 CT_TO_SECS(current->times.tms_cstime); 8891 prstatus.pr_cstime.tv_usec = 8892 CT_TO_USECS(current->times.tms_cstime); 8893 8894 /* This transfers the registers from regs into the 8895 * standard coredump arrangement, whatever that is. 
*/ 8896 #ifdef ELF_CORE_COPY_REGS 8897 ELF_CORE_COPY_REGS(prstatus.pr_reg, regs) 8898 #else 8899 if (sizeof(elf_gregset_t) != sizeof(struct pt_regs)) 8900 { 8901 printk("sizeof(elf_gregset_t) (%ld) != " 8902 "sizeof(struct pt_regs) (%ld)\n", 8903 (long)sizeof(elf_gregset_t), 8904 (long)sizeof(struct pt_regs)); 8905 } 8906 else 8907 *(struct pt_regs *)&prstatus.pr_reg = *regs; 8908 #endif 8909 8910 #ifdef DEBUG 8911 dump_regs("Passed in regs", (elf_greg_t *)regs); 8912 dump_regs("prstatus regs", 8913 (elf_greg_t *)&prstatus.pr_reg); 8914 #endif 8915 8916 notes[1].name = "CORE"; 8917 notes[1].type = NT_PRPSINFO; 8918 notes[1].datasz = sizeof(psinfo); 8919 notes[1].data = &psinfo; 8920 i = current->state ? ffz(~current->state) + 1 : 0; 8921 psinfo.pr_state = i; 8922 psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i]; 8923 psinfo.pr_zomb = psinfo.pr_sname == 'Z'; 8924 psinfo.pr_nice = current->priority-15; 8925 psinfo.pr_flag = current->flags; 8926 psinfo.pr_uid = current->uid; 8927 psinfo.pr_gid = current->gid; 8928 { 8929 int i, len; 8930 8931 set_fs(fs); 8932 8933 len = current->mm->arg_end - current->mm->arg_start; 8934 if (len >= ELF_PRARGSZ) 8935 len = ELF_PRARGSZ-1; 8936 copy_from_user(&psinfo.pr_psargs, 8937 (const char *)current->mm->arg_start, len); 8938 for(i = 0; i < len; i++) 8939 if (psinfo.pr_psargs[i] == 0) 8940 psinfo.pr_psargs[i] = ' '; 8941 psinfo.pr_psargs[len] = 0; 8942 8943 set_fs(KERNEL_DS); 8944 } 8945 strncpy(psinfo.pr_fname, current->comm, 8946 sizeof(psinfo.pr_fname)); 8947 8948 notes[2].name = "CORE"; 8949 notes[2].type = NT_TASKSTRUCT; 8950 notes[2].datasz = sizeof(*current); 8951 notes[2].data = current; 8952 8953 /* Try to dump the FPU. 
*/ 8954 prstatus.pr_fpvalid = dump_fpu (regs, &fpu); 8955 if (!prstatus.pr_fpvalid) 8956 { 8957 numnote--; 8958 } 8959 else 8960 { 8961 notes[3].name = "CORE"; 8962 notes[3].type = NT_PRFPREG; 8963 notes[3].datasz = sizeof(fpu); 8964 notes[3].data = &fpu; 8965 } 8966 8967 /* Write notes phdr entry */ 8968 { 8969 struct elf_phdr phdr; 8970 int sz = 0; 8971 8972 for(i = 0; i < numnote; i++) 8973 sz += notesize(¬es[i]); 8974 8975 phdr.p_type = PT_NOTE; 8976 phdr.p_offset = offset; 8977 phdr.p_vaddr = 0; 8978 phdr.p_paddr = 0; 8979 phdr.p_filesz = sz; 8980 phdr.p_memsz = 0; 8981 phdr.p_flags = 0; 8982 phdr.p_align = 0; 8983 8984 offset += phdr.p_filesz; 8985 DUMP_WRITE(&phdr, sizeof(phdr)); 8986 } 8987 8988 /* Page-align dumped data */ 8989 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); 8990 8991 /* Write program headers for segments dump */ 8992 for(vma = current->mm->mmap, i = 0; 8993 i < segs && vma != NULL; vma = vma->vm_next) { 8994 struct elf_phdr phdr; 8995 size_t sz; 8996 8997 i++; 8998 8999 sz = vma->vm_end - vma->vm_start; 9000 9001 phdr.p_type = PT_LOAD; 9002 phdr.p_offset = offset; 9003 phdr.p_vaddr = vma->vm_start; 9004 phdr.p_paddr = 0; 9005 phdr.p_filesz = maydump(vma) ? sz : 0; 9006 phdr.p_memsz = sz; 9007 offset += phdr.p_filesz; 9008 phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; 9009 if (vma->vm_flags & VM_WRITE) phdr.p_flags |= PF_W; 9010 if (vma->vm_flags & VM_EXEC) phdr.p_flags |= PF_X; 9011 phdr.p_align = ELF_EXEC_PAGESIZE; 9012 9013 DUMP_WRITE(&phdr, sizeof(phdr)); 9014 } 9015 9016 for(i = 0; i < numnote; i++) 9017 if (!writenote(¬es[i], &file)) 9018 goto close_coredump; 9019 9020 set_fs(fs); 9021 9022 DUMP_SEEK(dataoff); 9023 9024 for(i = 0, vma = current->mm->mmap; 9025 i < segs && vma != NULL; 9026 vma = vma->vm_next) { 9027 unsigned long addr = vma->vm_start; 9028 unsigned long len = vma->vm_end - vma->vm_start; 9029 9030 i++; 9031 if (!maydump(vma)) 9032 continue; 9033 #ifdef DEBUG 9034 printk("elf_core_dump: writing %08lx %lx\n", 9035 addr, len); 9036 #endif 9037 DUMP_WRITE((void *)addr, len); 9038 } 9039 9040 if ((off_t) file.f_pos != offset) { 9041 /* Sanity check */ 9042 printk("elf_core_dump: file.f_pos (%ld) != " 9043 "offset (%ld)\n", 9044 (off_t) file.f_pos, offset); 9045 } 9046 9047 close_coredump: 9048 if (file.f_op->release) 9049 file.f_op->release(inode,&file); 9050 9051 end_coredump: 9052 set_fs(fs); 9053 dput(dentry); 9054 #ifndef CONFIG_BINFMT_ELF 9055 MOD_DEC_USE_COUNT; 9056 #endif 9057 return has_dumped; 9058 } 9059 #endif /* USE_ELF_CORE_DUMP */ 9060 9061 int __init init_elf_binfmt(void) 9062 { 9063 return register_binfmt(&elf_format); 9064 } 9065 9066 #ifdef MODULE 9067 9068 int init_module(void) 9069 { 9070 /* Install the COFF, ELF and XOUT loaders. N.B. We 9071 * *rely* on the table being the right size with the 9072 * right number of free slots... */ 9073 return init_elf_binfmt(); 9074 } 9075 9076 9077 void cleanup_module( void) 9078 { 9079 /* Remove the COFF and ELF loaders. */ 9080 unregister_binfmt(&elf_format); 9081 } 9082 #endif /* FILE: fs/binfmt_java.c */ 9083 /* 9084 * linux/fs/binfmt_java.c 9085 * 9086 * Copyright (C) 1996 Brian A. Lantz 9087 * derived from binfmt_script.c 9088 * 9089 * Simplified and modified to support binary java 9090 * interpreters by Tom May . 
9091 */ 9092 9093 #include 9094 #include 9095 #include 9096 #include 9097 #include 9098 #include 9099 9100 #define _PATH_JAVA "/usr/bin/java" 9101 #define _PATH_APPLET "/usr/bin/appletviewer" 9102 9103 /* These paths can be modified with sysctl(). */ 9104 9105 char binfmt_java_interpreter[65] = _PATH_JAVA; 9106 char binfmt_java_appletviewer[65] = _PATH_APPLET; 9107 9108 static int do_load_java(struct linux_binprm *bprm, 9109 struct pt_regs *regs) 9110 { 9111 char *i_name; 9112 int len; 9113 int retval; 9114 struct dentry * dentry; 9115 unsigned char *ucp = (unsigned char *) bprm->buf; 9116 9117 if ((ucp[0] != 0xca) || (ucp[1] != 0xfe) || 9118 (ucp[2] != 0xba) || (ucp[3] != 0xbe)) 9119 return -ENOEXEC; 9120 9121 /* Fail if we're called recursively, e.g., the Java 9122 * interpreter is a java binary. */ 9123 if (bprm->java) 9124 return -ENOEXEC; 9125 9126 bprm->java = 1; 9127 9128 dput(bprm->dentry); 9129 bprm->dentry = NULL; 9130 9131 /* Set args: [0] the name of the java interpreter 9132 * [1] name of java class to execute, which 9133 * is the filename without the path and 9134 * without trailing ".class". Note that the 9135 * interpreter will use its own way to found 9136 * the class file (typically using 9137 * environment variable CLASSPATH), and may 9138 * in fact execute a different file from the 9139 * one we want. 9140 * 9141 * This is done in reverse order, because of how the 9142 * user environment and arguments are stored. 
*/ 9143 remove_arg_zero(bprm); 9144 len = strlen (bprm->filename); 9145 if (len >= 6 && 9146 !strcmp(bprm->filename + len - 6, ".class")) 9147 bprm->filename[len - 6] = 0; 9148 if ((i_name = strrchr (bprm->filename, '/')) != NULL) 9149 i_name++; 9150 else 9151 i_name = bprm->filename; 9152 bprm->p = copy_strings(1, &i_name, bprm->page, 9153 bprm->p, 2); 9154 bprm->argc++; 9155 9156 i_name = binfmt_java_interpreter; 9157 bprm->p = copy_strings(1, &i_name, bprm->page, 9158 bprm->p, 2); 9159 bprm->argc++; 9160 9161 if (!bprm->p) 9162 return -E2BIG; 9163 /* OK, now restart the process with the interpreter's 9164 * dentry. */ 9165 bprm->filename = binfmt_java_interpreter; 9166 dentry = open_namei(binfmt_java_interpreter, 0, 0); 9167 retval = PTR_ERR(dentry); 9168 if (IS_ERR(dentry)) 9169 return retval; 9170 9171 bprm->dentry = dentry; 9172 retval = prepare_binprm(bprm); 9173 if (retval < 0) 9174 return retval; 9175 9176 return search_binary_handler(bprm,regs); 9177 } 9178 9179 static int do_load_applet(struct linux_binprm *bprm, 9180 struct pt_regs *regs) 9181 { 9182 char *i_name; 9183 struct dentry * dentry; 9184 int retval; 9185 9186 if (strncmp (bprm->buf, "