/* prepage.c: A pre-page system call for Linux.
   This file implements prefetching of virtual memory pages for the Linux OS.
   Pre-fetching of pages is a flexible version of asynchronous I/O.  A list of
   address/extent tuples specifies the virtual memory pages to be resolved.
   The kernel makes a best-effort attempt to bring those pages into physical
   memory.

   Errata: this implementation only uses the base address, and not the
   range specification of the tuple.
   */

/* Banner string printed by init_module() when the module is loaded. */
static const char *version = "prepage.c:v1.00 \n";

/*
  Theory of Operation

  We accept a list of address/extent tuples, and start reading in the VM pages
  associated with each tuple.
  This is a best-effort attempt.  Failed I/O attempts are later resolved by
  the normal page fault mechanism, so we have the flexibility to have
  a direct, high-performance code path.
  The semantics of prepaging an invalid region are similar to reading that
  region: they may, but are not assured to, generate a SIGSEGV or a SIGBUS
  (the latter for a mmap()ed shared page).
  Alternative semantics of ignoring prepaging of invalid regions are equally
  valid, but are not deemed useful.
  */

#include <linux/sys.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <asm/pgtable.h>

/* Variables that can be set when loading the module: */
/* Turn on debugging information.  When nonzero, sys_prepage() printk()s its
   arguments and every tuple it reads from the user list. */
static int debug = 1;

/* The system call number we attempt to install ourselves as.  It is used as
   the index into sys_call_table[] by init_module() and cleanup_module(). */
static int syscall_num = 165;

/* One element of the user-supplied request list read by sys_prepage(). */
struct prepage_tuple {
  caddr_t addr;		/* Base address of the region; 0 ends the list. */
  size_t extent;	/* Length added to addr to form the end of the range. */
};

/* Forward declaration; the definition appears later in this file, after
   sys_prepage(), which calls it. */
static void prefetch_region(struct task_struct *tsk, struct vm_area_struct *vma,
						  unsigned long address, size_t extent, int write_access);

asmlinkage int sys_prepage(int magic, struct prepage_tuple *tlist, int arg3)
{
  struct task_struct *tsk = current;
  struct mm_struct *mm = tsk->mm;
  struct vm_area_struct * vma;
  int error, i = 3;
  
  if (debug)
	printk("Prepage request entry, args are %8.8x %8.8x %8.8x.\n",
		   magic, (int)tlist, arg3);

  while (--i > 0) {
    unsigned long address;
    size_t extent;

    if ((error = verify_area(VERIFY_READ, tlist,
			     sizeof(struct prepage_tuple))))
      return error;
    address = (unsigned long) get_user(&tlist->addr);
	if (address == 0)
	  break;
    extent = get_user(&tlist->extent);
	if (debug)
	  printk("Prepage request at %8.8lx for %d bytes.\n", address, extent);

	vma = find_vma_intersection(mm, address, address + extent);
	/* do_no_page efficiently checks for pages that already exists. */
	if (vma && (vma->vm_flags & (VM_READ | VM_EXEC)))
	  prefetch_region(tsk, vma, address, extent, 0);
	tlist++;
  }
  return 0;
}



/* See /usr/src/linux/arch/i386/kernel/head.S for values. */
/* Addresses of the two scratch pages that __bad_page() and __bad_pagetable()
   fill and hand back.  The type is spelled out because implicit int is not
   valid C since C99. */
const int empty_bad_page = 0x3000;
const int empty_bad_page_table = 0x4000;

/*
 * __bad_pagetable() - fill the page at empty_bad_page_table with copies of
 * the BAD_PAGE pte value and return that page as a pte_t pointer.
 */
pte_t * __bad_pagetable(void)
{

	/* "rep stosl" stores %eax (the BAD_PAGE pte value) into PAGE_SIZE/4
	   consecutive 32-bit words starting at %edi; "cld" makes the stores
	   walk upward in memory.
	   NOTE(review): "di" and "cx" appear both as input operands and in the
	   clobber list; newer GCC releases reject such overlap -- confirm
	   against the compiler actually used to build this module. */
	__asm__ __volatile__("cld ; rep ; stosl":
		:"a" (pte_val(BAD_PAGE)),
		 "D" ((long) empty_bad_page_table),
		 "c" (PAGE_SIZE/4)
		:"di","cx");
	return (pte_t *) empty_bad_page_table;
}

/*
 * __bad_page() - zero-fill the page at empty_bad_page and return a pte for
 * it, built with PAGE_SHARED protection and marked dirty.
 */
pte_t __bad_page(void)
{
	/* "rep stosl" with %eax == 0 clears PAGE_SIZE/4 32-bit words starting
	   at %edi, i.e. the whole page at empty_bad_page.
	   NOTE(review): "di" and "cx" appear both as input operands and in the
	   clobber list; newer GCC releases reject such overlap -- confirm
	   against the compiler actually used to build this module. */
	__asm__ __volatile__("cld ; rep ; stosl":
		:"a" (0),
		 "D" ((long) empty_bad_page),
		 "c" (PAGE_SIZE/4)
		:"di","cx");
	return pte_mkdirty(mk_pte((unsigned long) empty_bad_page, PAGE_SHARED));
}

/* Implement functionality similar to do_no_page().
   We could check enough error conditions to use it directly, but the point
   is moot as it's not exported for module use.  This simplifies the semantics
   for invalid accesses anyway...
   */
/*
 * prefetch_region() tries to create a new page mapping. It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * the "write_access" parameter is true in order to avoid the next
 * page fault.
 */

/*
 * Parameters:
 *   tsk           Task whose mm supplies the page tables and whose maj_flt
 *                 counter is incremented when a page is brought in.
 *   vma           Mapping used for the nopage/swapin operations, protection
 *                 bits and sharing flags.
 *   address       Address of the single page to resolve.
 *   extent        Not used by this implementation; only the page containing
 *                 "address" is handled.
 *   write_access  Nonzero to ask for a private, writable, dirty copy.
 *
 * Returns nothing.  Every failure path (no memory, nopage returning 0, page
 * already present) simply returns without reporting an error.
 */
static void prefetch_region(struct task_struct *tsk, struct vm_area_struct *vma,
						  unsigned long address, size_t extent, int write_access)
{
	pgd_t * pgd;
	pmd_t * pmd;
	pte_t * page_table;
	pte_t entry;
	unsigned long page;

	/* Walk to the pte for "address", allocating the middle directory and
	   the page table if they do not exist yet. */
	pgd = pgd_offset(tsk->mm, address);
	pmd = pmd_alloc(pgd, address);
	if (!pmd)
		goto no_memory;
	page_table = pte_alloc(pmd, address);
	if (!page_table)
		goto no_memory;
	entry = *page_table;
	if (pte_present(entry))
		goto is_present;		/* Nothing to do: already resident. */
	if (!pte_none(entry))
		goto swap_page;			/* Currently being swapped out. */
	/* Note: the swap_page path above is taken before this masking, so it
	   sees the caller's unmasked address. */
	address &= PAGE_MASK;
	if (!vma->vm_ops || !vma->vm_ops->nopage)
		goto anonymous_page;
	/*
	 * The third argument is "no_share", which tells the low-level code
	 * to copy, not share the page even if sharing is possible.  It's
	 * essentially an early COW detection 
	 */
	page = vma->vm_ops->nopage(vma, address,
		(vma->vm_flags & VM_SHARED)?0:write_access);
	if (!page)
		goto sigbus;
	++tsk->maj_flt;				/* Increment major fault counter, */
	++vma->vm_mm->rss;			/* and the resident set size. */
	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it's valid
	 * for other architectures too.
	 *
	 * Note that if write_access is true, we either now have
	 * a exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	flush_page_to_ram(page);	/* No-op on cache-coherent Intel. */
	entry = mk_pte(page, vma->vm_page_prot);
	if (write_access) {
		entry = pte_mkwrite(pte_mkdirty(entry));
	} else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
		/* Page is referenced elsewhere and the mapping is private:
		   map it read-only. */
		entry = pte_wrprotect(entry);
	/* The pte is re-read here; if it is no longer empty, the page just
	   obtained is released instead of being installed. */
	if (!pte_none(*page_table))
		free_page(pte_page(entry));
	else
	  set_pte(page_table, entry);
	/* no need to invalidate: a not-present page shouldn't be cached */
	return;

anonymous_page:
	/* We don't actually allocate anonymous pages. */
	return;

swap_page:
	/* We get here if the page is being swapped out.  We handle this case
	   only if it's easy to do so. */
#ifndef MODULE 
	do_swap_page(tsk, vma, address, page_table, entry, write_access);
#else
	if (!vma->vm_ops || !vma->vm_ops->swapin) {
	  /* We don't retrieve pages from anonymous swap yet. */
#ifdef notdef
	  swap_in(tsk, vma, page_table, pte_val(entry), write_access);
	  flush_page_to_ram(pte_page(*page_table));
#endif
	} else {
	  /* Ask the mapping's own swapin operation for the page, passing the
		 offset derived from the address and the saved pte value. */
	  pte_t page = vma->vm_ops->swapin(vma,
									   address - vma->vm_start + vma->vm_offset,
									   pte_val(entry));
	  /* If the pte changed while swapin ran, drop the page we were given
		 rather than overwrite the newer entry. */
	  if (pte_val(*page_table) != pte_val(entry)) {
		free_page(pte_page(page));
	  } else {
		if (mem_map[MAP_NR(pte_page(page))].count > 1
			&& !(vma->vm_flags & VM_SHARED))
		  page = pte_wrprotect(page);
		++vma->vm_mm->rss;
		++tsk->maj_flt;
		flush_page_to_ram(pte_page(page));
		set_pte(page_table, page);
	  }
	}
#endif
	return;

	/* All three outcomes are deliberately silent: this is a best-effort
	   call, so no signal is raised and no error is returned. */
sigbus:
no_memory:
is_present:
	return;
}

/* System call table defined outside this file.  init_module() and
   cleanup_module() write the entry at index syscall_num. */
extern int sys_call_table[];

#ifdef MODULE
int init_module(void)
{

  printk(version);

  if (sys_call_table[syscall_num]) {
	printk("The requested sys_call_table slot %d is already used!\n",
		   syscall_num);
	return 1;
  }

  sys_call_table[syscall_num] = (int)sys_prepage;

  return 0;
}

void
cleanup_module(void)
{
  sys_call_table[syscall_num] = 0;
}

#endif /* MODULE */


/*
 * Local variables:
 *  compile-command: "gcc -DMODULE -D__KERNEL__ -Wall -Wstrict-prototypes -O6 -c prepage.c"
 *  c-indent-level: 4
 *  tab-width: 4
 * End:
 */



