Logo Search packages:      
Sourcecode: linux version File versions

fault.c

/*
 *  PowerPC version
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Modified by Cort Dougan and Paul Mackerras.
 *
 *  Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>

#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/siginfo.h>


#ifdef CONFIG_KPROBES
/*
 * Offer a kernel-mode fault to the kprobes fault handler.
 *
 * Returns 1 when a kprobe is running and kprobe_fault_handler()
 * returns non-zero; returns 0 otherwise (always 0 for user-mode
 * faults, which are never passed to kprobes here).
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
      int claimed;

      if (user_mode(regs))
            return 0;

      /* kprobe_running() needs smp_processor_id(), so pin the CPU */
      preempt_disable();
      claimed = kprobe_running() && kprobe_fault_handler(regs, 11);
      preempt_enable();

      return claimed;
}
#else
/* Without CONFIG_KPROBES no fault is ever claimed. */
static inline int notify_page_fault(struct pt_regs *regs)
{
      return 0;
}
#endif

/*
 * Decide whether the instruction at regs->nip is a store in an
 * update addressing form whose rA field names r1.
 *
 * Returns 1 if it is, 0 if it is not.  Also returns 0 when the
 * instruction word cannot be fetched with get_user().
 */
static int store_updates_sp(struct pt_regs *regs)
{
      unsigned int insn;
      unsigned int major;
      unsigned int minor;

      if (get_user(insn, (unsigned int __user *)regs->nip))
            return 0;

      /* the rA field must be 1 */
      if (((insn >> 16) & 0x1f) != 1)
            return 0;

      /* D-form / DS-form stores, selected by major opcode */
      major = insn >> 26;
      if (major == 62)              /* std or stdu */
            return (insn & 3) == 1;
      if (major == 37 ||            /* stwu */
          major == 39 ||            /* stbu */
          major == 45 ||            /* sthu */
          major == 53 ||            /* stfsu */
          major == 55)              /* stfdu */
            return 1;
      if (major != 31)
            return 0;

      /* X-form stores under major opcode 31, selected by minor opcode */
      minor = (insn >> 1) & 0x3ff;
      return minor == 181 ||        /* stdux */
             minor == 183 ||        /* stwux */
             minor == 247 ||        /* stbux */
             minor == 439 ||        /* sthux */
             minor == 695 ||        /* stfsux */
             minor == 759;          /* stfdux */
}

/*
 * do_page_fault - try to resolve a page fault against current->mm.
 *
 * Looks up the vma covering 'address' (growing a VM_GROWSDOWN vma if
 * the access qualifies as stack growth), checks that the kind of access
 * is permitted by the vma, and then calls handle_mm_fault().
 *
 * For 600- and 800-family processors, the error_code parameter is DSISR
 * for a data fault, SRR1 for an instruction fault. For 400-family processors
 * the error_code parameter is ESR for a data fault, 0 for an instruction
 * fault.
 * For 64-bit processors, the error_code parameter is
 *  - DSISR for a non-SLB data access fault,
 *  - SRR1 & 0x08000000 for a non-SLB instruction access fault
 *  - 0 any SLB fault.
 *
 * The return value is 0 if the fault was handled, or the signal
 * number if this is a kernel fault that can't be handled here.
 * For user-mode faults that cannot be resolved, the signal is raised
 * from here (SIGSEGV via _exception(), SIGBUS via force_sig_info(),
 * SIGKILL via do_group_exit()) and the SIGSEGV/SIGBUS paths return 0.
 */
int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
                      unsigned long error_code)
{
      struct vm_area_struct * vma;
      struct mm_struct *mm = current->mm;
      siginfo_t info;
      /* si_code for a SIGSEGV; switched to SEGV_ACCERR once a vma is found */
      int code = SEGV_MAPERR;
      int is_write = 0, ret;
      int trap = TRAP(regs);
      /* trap 0x400 is handled as the instruction-fault case below */
      int is_exec = trap == 0x400;

#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
      /*
       * Fortunately the bit assignments in SRR1 for an instruction
       * fault and DSISR for a data fault are mostly the same for the
       * bits we are interested in.  But there are some bits which
       * indicate errors in DSISR but can validly be set in SRR1.
       */
      if (trap == 0x400)
            error_code &= 0x48200000;
      else
            is_write = error_code & DSISR_ISSTORE;
#else
      is_write = error_code & ESR_DST;
#endif /* CONFIG_4xx || CONFIG_BOOKE */

      /* a running kprobe's fault handler claimed this fault */
      if (notify_page_fault(regs))
            return 0;

      /* the debugger hook returned non-zero; nothing more to do here */
      if (unlikely(debugger_fault_handler(regs)))
            return 0;

      /* On a kernel SLB miss we can only check for a valid exception entry */
      if (!user_mode(regs) && (address >= TASK_SIZE))
            return SIGSEGV;

#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
      if (error_code & DSISR_DABRMATCH) {
            /* DABR match */
            do_dabr(regs, address, error_code);
            return 0;
      }
#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/

      if (in_atomic() || mm == NULL) {
            /* kernel mode: hand SIGSEGV back to the caller */
            if (!user_mode(regs))
                  return SIGSEGV;
            /* in_atomic() in user mode is really bad,
               as is current->mm == NULL. */
            printk(KERN_EMERG "Page fault in user mode with "
                   "in_atomic() = %d mm = %p\n", in_atomic(), mm);
            printk(KERN_EMERG "NIP = %lx  MSR = %lx\n",
                   regs->nip, regs->msr);
            die("Weird page fault", regs, SIGSEGV);
      }

      /* When running in the kernel we expect faults to occur only to
       * addresses in user space.  All other faults represent errors in the
       * kernel and should generate an OOPS.  Unfortunately, in the case of an
       * erroneous fault occurring in a code path which already holds mmap_sem
       * we will deadlock attempting to validate the fault against the
       * address space.  Luckily the kernel only validly references user
       * space from well defined areas of code, which are listed in the
       * exceptions table.
       *
       * As the vast majority of faults will be valid we will only perform
       * the source reference check when there is a possibility of a deadlock.
       * Attempt to lock the address space, if we cannot we then validate the
       * source.  If this is invalid we can skip the address space check,
       * thus avoiding the deadlock.
       */
      if (!down_read_trylock(&mm->mmap_sem)) {
            if (!user_mode(regs) && !search_exception_tables(regs->nip))
                  goto bad_area_nosemaphore;

            down_read(&mm->mmap_sem);
      }

      /*
       * mmap_sem is held for reading from here on; every exit path
       * below releases it with up_read() before returning.
       */
      vma = find_vma(mm, address);
      if (!vma)
            goto bad_area;
      if (vma->vm_start <= address)
            goto good_area;
      /* address lies below the vma: only acceptable as stack growth */
      if (!(vma->vm_flags & VM_GROWSDOWN))
            goto bad_area;

      /*
       * N.B. The POWER/Open ABI allows programs to access up to
       * 288 bytes below the stack pointer.
       * The kernel signal delivery code writes up to about 1.5kB
       * below the stack pointer (r1) before decrementing it.
       * The exec code can write slightly over 640kB to the stack
       * before setting the user r1.  Thus we allow the stack to
       * expand to 1MB without further checks.
       */
      if (address + 0x100000 < vma->vm_end) {
            /* get user regs even if this fault is in kernel mode */
            struct pt_regs *uregs = current->thread.regs;
            if (uregs == NULL)
                  goto bad_area;

            /*
             * A user-mode access to an address a long way below
             * the stack pointer is only valid if the instruction
             * is one which would update the stack pointer to the
             * address accessed if the instruction completed,
             * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
             * (or the byte, halfword, float or double forms).
             *
             * If we don't check this then any write to the area
             * between the last mapped region and the stack will
             * expand the stack rather than segfaulting.
             */
            if (address + 2048 < uregs->gpr[1]
                && (!user_mode(regs) || !store_updates_sp(regs)))
                  goto bad_area;
      }
      if (expand_stack(vma, address))
            goto bad_area;

good_area:
      /* a vma covers the address, so any failure now is an access error */
      code = SEGV_ACCERR;
#if defined(CONFIG_6xx)
      if (error_code & 0x95700000)
            /* an error such as lwarx to I/O controller space,
               address matching DABR, eciwx, etc. */
            goto bad_area;
#endif /* CONFIG_6xx */
#if defined(CONFIG_8xx)
        /* The MPC8xx seems to always set 0x80000000, which is
         * "undefined".  Of those that can be set, this is the only
         * one which seems bad.
         */
      if (error_code & 0x10000000)
                /* Guarded storage error. */
            goto bad_area;
#endif /* CONFIG_8xx */

      if (is_exec) {
#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
            /* protection fault */
            if (error_code & DSISR_PROTFAULT)
                  goto bad_area;
            /*
             * Allow execution from readable areas if the MMU does not
             * provide separate controls over reading and executing.
             */
            if (!(vma->vm_flags & VM_EXEC) &&
                (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
                 !(vma->vm_flags & (VM_READ | VM_WRITE))))
                  goto bad_area;
#else
            pte_t *ptep;
            pmd_t *pmdp;

            /* Since 4xx/Book-E supports per-page execute permission,
             * we lazily flush dcache to icache. */
            ptep = NULL;
            if (get_pteptr(mm, address, &ptep, &pmdp)) {
                  spinlock_t *ptl = pte_lockptr(mm, pmdp);
                  spin_lock(ptl);
                  if (pte_present(*ptep)) {
                        struct page *page = pte_page(*ptep);

                        /* PG_arch_1 records that this page was already flushed */
                        if (!test_bit(PG_arch_1, &page->flags)) {
                              flush_dcache_icache_page(page);
                              set_bit(PG_arch_1, &page->flags);
                        }
                        pte_update(ptep, 0, _PAGE_HWEXEC |
                                 _PAGE_ACCESSED);
                        _tlbie(address, mm->context.id);
                        pte_unmap_unlock(ptep, ptl);
                        up_read(&mm->mmap_sem);
                        /* present pte fixed up in place; handle_mm_fault() not needed */
                        return 0;
                  }
                  pte_unmap_unlock(ptep, ptl);
            }
#endif
      /* a write */
      } else if (is_write) {
            if (!(vma->vm_flags & VM_WRITE))
                  goto bad_area;
      /* a read */
      } else {
            /* protection fault */
            if (error_code & 0x08000000)
                  goto bad_area;
            if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                  goto bad_area;
      }

      /*
       * If for any reason at all we couldn't handle the fault,
       * make sure we exit gracefully rather than endlessly redo
       * the fault.
       */
      /* 'survive' is also the retry point used by out_of_memory below */
 survive:
      ret = handle_mm_fault(mm, vma, address, is_write);
      if (unlikely(ret & VM_FAULT_ERROR)) {
            if (ret & VM_FAULT_OOM)
                  goto out_of_memory;
            else if (ret & VM_FAULT_SIGBUS)
                  goto do_sigbus;
            /* an error bit other than OOM/SIGBUS is not expected here */
            BUG();
      }
      /* account the fault to the current task as major or minor */
      if (ret & VM_FAULT_MAJOR)
            current->maj_flt++;
      else
            current->min_flt++;
      up_read(&mm->mmap_sem);
      return 0;

      /* entered with mmap_sem held; falls through after releasing it */
bad_area:
      up_read(&mm->mmap_sem);

      /* entered without mmap_sem held */
bad_area_nosemaphore:
      /* User mode accesses cause a SIGSEGV */
      if (user_mode(regs)) {
            _exception(SIGSEGV, regs, code, address);
            return 0;
      }

      if (is_exec && (error_code & DSISR_PROTFAULT)
          && printk_ratelimit())
            printk(KERN_CRIT "kernel tried to execute NX-protected"
                   " page (%lx) - exploit attempt? (uid: %d)\n",
                   address, current->uid);

      /* kernel-mode fault: report the signal number to the caller */
      return SIGSEGV;

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
      up_read(&mm->mmap_sem);
      /*
       * The global init task is not killed: yield, retake mmap_sem
       * and retry handle_mm_fault() instead.
       */
      if (is_global_init(current)) {
            yield();
            down_read(&mm->mmap_sem);
            goto survive;
      }
      printk("VM: killing process %s\n", current->comm);
      if (user_mode(regs))
            do_group_exit(SIGKILL);
      return SIGKILL;

do_sigbus:
      up_read(&mm->mmap_sem);
      /* user mode: deliver SIGBUS/BUS_ADRERR for the faulting address */
      if (user_mode(regs)) {
            info.si_signo = SIGBUS;
            info.si_errno = 0;
            info.si_code = BUS_ADRERR;
            info.si_addr = (void __user *)address;
            force_sig_info(SIGBUS, &info, current);
            return 0;
      }
      /* kernel mode: report the signal number to the caller */
      return SIGBUS;
}

/*
 * bad_page_fault is called when we have a bad access from the kernel.
 * It is called from the DSI and ISI handlers in head.S and from some
 * of the procedures in traps.c.
 *
 * regs:    register state at the time of the fault
 * address: faulting address as passed by the caller; not used here,
 *          the data address that gets reported is regs->dar
 * sig:     signal number handed to die() if the fault cannot be fixed up
 *
 * If regs->nip has an entry in the exception tables, regs->nip is
 * redirected to that entry's fixup and the function returns.
 * Otherwise a description of the fault is printed and die() is called.
 */
void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
{
      const struct exception_table_entry *entry;

      /* Are we prepared to handle this fault?  */
      entry = search_exception_tables(regs->nip);
      if (entry != NULL) {
            regs->nip = entry->fixup;
            return;
      }

      /* kernel has accessed a bad area */

      /*
       * Decode the vector with TRAP(), the same way do_page_fault()
       * does, instead of comparing the raw regs->trap value.
       * NOTE(review): regs->trap may carry flag bits in its low bits
       * (see TRAP() in asm/ptrace.h); with the raw value such a fault
       * would be reported as "unknown fault".
       */
      switch (TRAP(regs)) {
      case 0x300:
      case 0x380:
            printk(KERN_ALERT "Unable to handle kernel paging request for "
                  "data at address 0x%08lx\n", regs->dar);
            break;
      case 0x400:
      case 0x480:
            printk(KERN_ALERT "Unable to handle kernel paging request for "
                  "instruction fetch\n");
            break;
      default:
            printk(KERN_ALERT "Unable to handle kernel paging request for "
                  "unknown fault\n");
            break;
      }
      printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
            regs->nip);

      die("Kernel access of bad area", regs, sig);
}

Generated by  Doxygen 1.6.0   Back to index