#include <linux/errno.h>
#include <linux/cn_memmon.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <linux/highmem.h>

#include <asm/timex.h>


/* BATCH_FLUSH selects the batched variant of the dirty-bit cleaning code
 * used by the page scan in this file: PTE dirty bits are tested and cleared
 * one page at a time and the TLB flush is issued later for the whole batch
 * instead of once per page.  It is only enabled for ppc64 builds.
 */
#ifdef __PPC64__
#define BATCH_FLUSH
#endif

/* The memmon "sister" page flags work like this:
 * At the lowest level, each page of memory has a corresponding
 * byte of flags called the "memmon flag entry".  These are grouped
 * into a page worth of flags, and together they make up a "memmon flag
 * table".  There are 4096 mfe's in an mft.  Each mft covers 4096^2 or
 * 16777216 bytes of address space.
 *
 * Each mft is pointed to by a "memmon flag directory entry", which is
 * stored on a separate page.  It only takes 256 mfde's to cover the entire
 * 32-bit address space.  These mfde's are grouped into the "memmon flag
 * directory".
 * 
 * Finally, a pointer to the "memmon flag directory" (if it exists) is
 * stored in the mm_struct for the memory map in question.
 *
 * To get from a memory address to the flags for that address, we do the 
 * following:
 * 1) Get the location of the memmon flag directory.  (Note: it may not exist
 *    in which case there were no flags for that address.)
 * 2) Use the high 8 bits of the memory address as an offset within the
 *    memmon flag directory.
 * 3) Get the location of the memmon flag table.  (Note: it may not exist
 *    in which case there were no flags for that address.)
 * 4) Use the next 12 bits of the memory address as an offset within the 
 *    memmon flag table.
 * 5) The byte at that location holds the flag bits for the address in question.
 */


/* Storage types for the two-level flag lookup.
 *
 * mfte_t  - "memmon flag table entry": the flag byte kept for one page.
 * mfde_t  - "memmon flag directory entry": pointer to a flag table.
 * mfdep_t - pointer to a directory entry; also used as the directory base.
 */
typedef unsigned char mfte_t;
typedef mfte_t * mfde_t;
typedef mfde_t *mfdep_t;

/* Directory level: the bits of the address from MFD_SHIFT upward, masked
 * to PTRS_PER_MFD slots, select the directory entry.
 */
#define MFD_SHIFT 24
#define PTRS_PER_MFD 256
#define MFD_MASK (PTRS_PER_MFD - 1)
#define mfde_offset(addr) (((addr) >> (MFD_SHIFT)) & MFD_MASK)

/* Table level: the bits of the address from MFT_SHIFT upward, masked to
 * ENTRIES_PER_MFT slots, select the flag byte within a flag table.
 */
#define MFT_SHIFT 12
#define ENTRIES_PER_MFT 4096
#define MFT_MASK (ENTRIES_PER_MFT - 1)
#define mfte_offset(addr) (((addr) >> (MFT_SHIFT)) & MFT_MASK)

/* Last address served by the same flag table as "addr": every bit below
 * MFD_SHIFT is set.  Derived from MFD_SHIFT so the two cannot drift apart.
 */
#define LAST_FLAG_TABLE_ADDR(addr) ((addr) | ((1U << MFD_SHIFT) - 1))

/* Accessors.  All arguments are parenthesized so that callers may pass
 * arbitrary expressions (e.g. "p + 1") without precedence surprises.
 *
 * memmon_flag_dir(mm)              - address of the directory pointer in "mm".
 * memmon_flag_dir_entry(mfdp, a)   - directory entry for address "a"; "mfdp"
 *                                    is what memmon_flag_dir() returned and
 *                                    *mfdp must be non-NULL.
 * memmon_flag_table_entry(mfdep,a) - flag byte for address "a"; "mfdep" is
 *                                    what memmon_flag_dir_entry() returned
 *                                    and *mfdep must be non-NULL.
 */
#define memmon_flag_dir(mm) ((mfdep_t *) &(mm)->memmon_flag_dir)
#define memmon_flag_dir_entry(mfdp, addr) (*(mfdp) + mfde_offset(addr))
#define memmon_flag_table_entry(mfdep, addr) (*(mfdep) + mfte_offset(addr))

/* Turn on every bit of "flags" in the flag byte at "flagptr"; bits that are
 * already set are left untouched.
 */
static inline void memmon_set_flags(unsigned char *flagptr, unsigned char flags)
{
	const unsigned char current_bits = *flagptr;

	*flagptr = (unsigned char)(current_bits | flags);
}

/* Turn off every bit of "flags" in the flag byte at "flagptr"; all other
 * bits keep their current value.
 */
static inline void memmon_clr_flags(unsigned char *flagptr, unsigned char flags)
{
	const unsigned char keep_mask = (unsigned char)~flags;

	*flagptr = (unsigned char)(*flagptr & keep_mask);
}

/* Return the bits of "flags" that are currently set in the flag byte at
 * "flagptr".  The result is non-zero if at least one of them is set and
 * zero if none is.
 */
static inline int memmon_test_flags(unsigned char *flagptr, unsigned char flags)
{
	const unsigned char current_bits = *flagptr;

	return current_bits & flags;
}


/* Set flag bits in the memmon flag bytes covering part of an address range.
 *
 * mm:    memory map whose flag directory is updated.
 * start: in/out.  First address of the range that still has to be handled;
 *        on success it is advanced past the part handled by this call.
 * end:   last address of the range (inclusive).  Only read, never written.
 * ppage: in/out.  Address of a spare zeroed page, or 0 if the caller has
 *        none.  If the flag directory or the flag table for "*start" does
 *        not exist yet, the spare page is installed in its place and *ppage
 *        is set to 0; from then on the page belongs to "mm".
 * flags: bits that are ORed into the flag byte of every page handled.
 *
 * A single call only touches one flag table, i.e. it stops at the last
 * address that shares a flag table with "*start" even if "*end" lies beyond
 * it.
 *
 * Returns:
 *   0         the range is finished (this also covers an empty range, i.e.
 *             *start > *end, in which case nothing at all is modified).
 *   1         flags were applied but part of the range is left; call again.
 *   -ENOBUFS  a page was needed and *ppage was 0.  Nothing was flagged, but
 *             a directory page supplied earlier may already be installed.
 *             The caller should provide a zeroed page and call again.
 *
 * mm->mmap_sem is taken for writing for the duration of the update.
 */

static int __set_address_flags(struct mm_struct *mm, unsigned int *start, 
	unsigned int *end, unsigned long *ppage, unsigned char flags)
{
	int ret;
	mfdep_t *mfdp;
	mfdep_t mfdep;
	mfte_t *tmp_mftep, *end_mftep;
	unsigned int last_addr, tempend;

	/* An inverted range is empty.  Without this check "tempend" would be
	 * taken from "*end" while the table is selected from "*start", which
	 * can flag pages that were never part of the request and can use up
	 * the caller's spare page for nothing.
	 */
	if (*start > *end)
		return 0;

	ret = -ENOBUFS;
	down_write(&mm->mmap_sem);
	
	mfdp = memmon_flag_dir(mm);
#ifdef DEBUG
	printk("*mfdp: %p\n", *mfdp);
#endif
	if (!*mfdp) {
		/* No flag directory yet: we need a page for it. */
		if (!*ppage) goto up_out;

		/* caller passed in a page, how nice of them */
		*mfdp = (mfdep_t) *ppage;
		*ppage = 0;
	}
	
	mfdep = memmon_flag_dir_entry(mfdp, *start);
#ifdef DEBUG
	printk("*mfdp: %p\n", *mfdp);
	printk("mfdep: %p\n", mfdep);
	printk("*mfdep: %p\n", *mfdep);
#endif
	if (!*mfdep) {
		/* No flag table for this part of the address space yet: we
		 * need a page for it.  If the directory was just created
		 * above, the spare page is gone and we ask for another one.
		 */
		if (!*ppage) goto up_out;
	 
		/* caller passed in a page, how nice of them */
		*mfdep = (mfde_t) *ppage;
		*ppage = 0;
	}
#ifdef DEBUG
	printk("*mfdep: %p\n", *mfdep);
#endif

	/* Each time we call this function, we can only handle address ranges
	 * that map to a single flag table.
	 *
	 * To ensure this, we set "tempend" to whichever is smaller, either
	 * "*end", or the last address that will map to the same flag table
	 * as "*start".
	 */
	last_addr = LAST_FLAG_TABLE_ADDR(*start);
	tempend = min(last_addr, *end);
	
	tmp_mftep = memmon_flag_table_entry(mfdep, *start);
	end_mftep = memmon_flag_table_entry(mfdep, tempend);
	
#ifdef DEBUG
	printk("tmp_mftep: %p\n", tmp_mftep);
	printk("end_mftep: %p\n", end_mftep);
#endif
	
	/* Both pointers are inside the same table; the end is inclusive. */
	while (tmp_mftep <= end_mftep) {	
		memmon_set_flags(tmp_mftep, flags);
		tmp_mftep++;
	}
	
	/* Success on this call.  If we need to get called again,
	 * return 1.
	 */
	 
	*start = tempend + 1;
	ret = (tempend == *end) ? 0 : 1;

up_out:
	up_write(&mm->mmap_sem);
	return ret;
}

/* Memory-handling wrapper around __set_address_flags().
 *
 * __set_address_flags() reports -ENOBUFS when it needs a zeroed page that
 * we have not handed to it.  In that case we allocate one and call it again;
 * if it installs the page it resets our copy of the address to 0.
 *
 * The function is also called again whenever it reports that part of the
 * range is still outstanding, so by the time we return the whole range has
 * been handled or we have run out of memory.
 *
 * If a page was allocated but ended up not being needed (for instance
 * because somebody else installed the table in the meantime) we still own
 * it and release it before returning.
 *
 * Returns 0 on success, -ENOMEM if a required page could not be allocated.
 */

static int _set_address_flags(struct mm_struct *mm, unsigned int *start, 
		unsigned int *end, unsigned char flags)
{
	unsigned long spare_page = 0;
	int status;

#ifdef DEBUG
	printk("start: %x  end: %x\n", *start, *end);
#endif

	for (;;) {
		status = __set_address_flags(mm, start, end, &spare_page,
					     flags);

		if (status == -ENOBUFS) {
			/* A page is wanted; -ENOBUFS is kept as the status on
			 * success so that we go round once more.
			 */
			spare_page = get_zeroed_page(GFP_KERNEL);
			if (!spare_page)
				status = -ENOMEM;
		}

		if (status == 0 || status == -ENOMEM)
			break;
	}

	/* Still ours if it was never installed. */
	if (spare_page)
		free_page(spare_page);

	return status;
}




/* Set the bits in "args->flags" in the memmon flag byte of every page in
 * the address range "args->start" .. "args->end" of "mm".
 *
 * "args->start" is a value/result field: it is advanced as the range is
 * processed, so that "args->start" .. "args->end" always describes the part
 * of the range that has *not* been flagged yet.  Callers must not rely on
 * its original value after the call.  When the call succeeds it ends up one
 * past "args->end".
 *
 * Returns 0 on success, or -ENOMEM if memory for the flag tables could not
 * be allocated.
 */

int memmon_set_address_flags(struct mm_struct *mm, struct memmon_info *args)
{
	int status;

	/* A positive status means "part of the range is left, call again";
	 * zero or a negative error code ends the loop.
	 */
	for (;;) {
		status = _set_address_flags(mm, &args->start, &args->end,
					    args->flags);
		if (status <= 0)
			break;
	}

	return status;
}




/* Clear flag bits in the memmon flag bytes covering part of an address
 * range.
 *
 * mm:    memory map whose flag tables are updated.
 * start: in/out.  First address of the range that still has to be handled;
 *        it is advanced past the part handled by this call.
 * end:   last address of the range (inclusive).  Only read, never written.
 * flags: bits that are cleared in the flag byte of every page handled.
 *
 * Nothing is ever allocated here: where no flag directory or no flag table
 * exists there is nothing to clear, and that part of the range is simply
 * skipped.
 *
 * Normally a single call only touches one flag table, i.e. it stops at the
 * last address that shares a flag table with "*start".  The exception is
 * when "mm" has no flag directory at all, in which case the whole range is
 * treated as done in one go.
 *
 * Returns 0 if the range is finished (this also covers an empty range, i.e.
 * *start > *end, in which case nothing at all is modified), or 1 if part of
 * the range is left and the caller should call again.
 *
 * mm->mmap_sem is taken for writing for the duration of the update.
 */

static int _clr_address_flags(struct mm_struct *mm, unsigned int *start, 
	unsigned int *end, unsigned char flags)
{
	int ret;
	mfdep_t *mfdp;
	mfdep_t mfdep;
	mfte_t *tmp_mftep, *end_mftep;
	unsigned int tempend;

	/* An inverted range is empty.  Without this check "tempend" would be
	 * taken from "*end" while the table is selected from "*start", which
	 * can clear flags on pages that were never part of the request.
	 */
	if (*start > *end)
		return 0;

	down_write(&mm->mmap_sem);

	mfdp = memmon_flag_dir(mm);
	if (!*mfdp) {
		/* Nothing is monitored, so shortcut any possible additional
		 * ranges that have to be cleared.
		 */
		tempend = *end;
		goto success_out;
	}
		
	/* Each time we call this function, we can only handle address ranges
	 * that map to a single flag table.
	 *
	 * "tempend" is set to the last address that will map to the same
	 * flag table as "*start".
	 *
	 * If "tempend" is after "*end", it means that
	 * "*end" maps to the same flag table as "*start", so we then set
	 * "tempend" to "*end".
	 */
	tempend = LAST_FLAG_TABLE_ADDR(*start);
	if (tempend > *end)
		tempend = *end;
	
	/* We want to get a pointer to the appropriate entry
	 * in the flag directory.  This could be 0 if nothing has ever been
	 * monitored in that address range--which just means less work for us.
	 */
	mfdep = memmon_flag_dir_entry(mfdp, *start);
	if (!*mfdep) {
		/* Nothing in this particular range is being monitored, so no
		 * work to clear it.  "tempend" is already set, so just call
		 * it a success.
		 */
		 goto success_out;
	}

	/* Now we want to find the right entry range in the flag table.
	 * Both pointers are inside the same table; the end is inclusive.
	 */
	tmp_mftep = memmon_flag_table_entry(mfdep, *start);
	end_mftep = memmon_flag_table_entry(mfdep, tempend);
	
	while (tmp_mftep <= end_mftep) {	
		memmon_clr_flags(tmp_mftep, flags);
		tmp_mftep++;
	}

success_out:	
	/* Success on this call.  If we need to get called again,
	 * return 1.
	 */
	 
	*start = tempend + 1;
	ret = (tempend == *end) ? 0 : 1;

	up_write(&mm->mmap_sem);
	return ret;
}

/* Clear the bits in "args->flags" in the memmon flag byte of every page in
 * the address range "args->start" .. "args->end" of "mm".
 *
 * Nothing is returned because the operation has no failure mode: clearing
 * a bit for an address that is not being monitored is simply a no-op and is
 * *not* treated as an error.
 *
 * "args->start" is advanced while the range is processed, so callers must
 * not rely on its original value afterwards.
 *
 * The caller should ensure that args->start is smaller than args->end.
 */

void memmon_clr_address_flags(struct mm_struct *mm, struct memmon_info *args)
{
	/* Each call handles one chunk of the range and moves args->start
	 * forward; a positive result means there is more left to do.
	 */
	while (_clr_address_flags(mm, &args->start, &args->end,
				  args->flags) > 0)
		;
}


/* Look up the memmon flag byte for the page containing "addr" in "mm".
 *
 * Returns a pointer to the flag byte, or NULL if "mm" has no flag directory
 * or no flag table covering "addr" (i.e. no flags were ever set there).
 *
 * It is expected that the caller holds mm->mmap_sem.
 */

static mfte_t * va_to_mftep(struct mm_struct *mm, unsigned int addr)
{
	mfdep_t *dir_slot = memmon_flag_dir(mm);
	mfdep_t dir_entry;

	/* No directory at all for this mm. */
	if (!*dir_slot)
		return NULL;

	/* Directory exists; see whether it has a table for this address. */
	dir_entry = memmon_flag_dir_entry(dir_slot, addr);
	if (!*dir_entry)
		return NULL;

	/* Pick the flag byte for the page out of the table. */
	return memmon_flag_table_entry(dir_entry, addr);
}

/* Walk the page tables of "mm" down to the PTE for "addr" and map it.
 *
 * Returns the mapped PTE pointer obtained from pte_offset_map(), or NULL
 * if any of the upper levels (pgd, pud, pmd) is empty or bad.  The callers
 * in this file release a non-NULL result with pte_unmap().
 *
 * The walk is modelled on follow_page() but does not support huge tlbs.
 */
static  pte_t *va_to_ptep_map(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd_entry;
	pud_t *pud_entry;
	pmd_t *pmd_entry;

	pgd_entry = pgd_offset(mm, addr);
	if (pgd_none(*pgd_entry) || unlikely(pgd_bad(*pgd_entry)))
		return NULL;

	pud_entry = pud_offset(pgd_entry, addr);
	if (pud_none(*pud_entry) || unlikely(pud_bad(*pud_entry)))
		return NULL;

	pmd_entry = pmd_offset(pud_entry, addr);
	if (pmd_none(*pmd_entry) || unlikely(pmd_bad(*pmd_entry)))
		return NULL;

	return pte_offset_map(pmd_entry, addr);
}

/* Scan the pages of the address range "args->start" .. "args->end" of "mm",
 * count the ones that pass every requested check and optionally record
 * and/or clean them.
 *
 * "args->options" is a bit mask:
 *   STORE_MATCHING_PAGES  the user buffer "args->buf" of "args->bufsize"
 *                         bytes receives the addresses of matching pages,
 *                         one unsigned int each, until it is full.
 *   CHECK_FLAGS           a page only matches if at least one bit of
 *                         "args->flags" is set in its memmon flag byte.
 *   CHECK_DIRTY_STATE     a page only matches if it is dirty.
 *   CHECK_SWAPPED_STATE   (CONFIG_MEMMON_SWAP_SUPPORT only) a page also
 *                         matches if its PTE is not present but not empty.
 *   CLEAN_MATCHING_PAGES  clear the dirty bit of every matching page.
 *   CLEAN_STORED_PAGES    clear the dirty bit of every page whose address
 *                         was stored in the buffer.
 *   STOP_WHEN_BUF_FULL    stop scanning once the buffer has been filled.
 *
 * Returns the number of matching pages (which may exceed the number of
 * addresses stored unless STOP_WHEN_BUF_FULL is given), or -EFAULT if the
 * user buffer fails the access check.
 *
 * mm->mmap_sem is held for writing and mm->page_table_lock is held while
 * the page tables are walked.
 */
int memmon_collect_data(struct mm_struct *mm, struct memmon_info *args)
{
	int ret;
	unsigned int start = args->start;
	unsigned int end = args->end;
	unsigned long bufaddr = args->buf;
	unsigned int __user *buf = (unsigned int __user *) bufaddr;
	int options = args->options;
	unsigned int entries = 0;

	unsigned int addr;
	unsigned int last_page;
	unsigned int page_count=0;
	unsigned int loop_count=0;
#ifdef BATCH_FLUSH
	int need_flush=0;
#endif
	ret = -EFAULT;

	if (options & STORE_MATCHING_PAGES) {
		ret = !access_ok(VERIFY_WRITE, buf, args->bufsize);
		if (ret) {
			printk("verification failed\n");
			ret = -EFAULT;
			goto out;
		}
		/* Number of page addresses that fit into the buffer. */
		entries =  args->bufsize / sizeof(*buf);
	}

	down_write(&mm->mmap_sem);

	/* scan through the entire address space given */
	page_count = 0;

	/* Page containing "end".  The loop leaves explicitly after handling
	 * it, because "addr += PAGE_SIZE" would wrap around to 0 and never
	 * terminate if that page is the last one of the 32-bit space.
	 */
	last_page = end & PAGE_MASK;

	/* Must hold the page table spinlock while walking */
	spin_lock(&mm->page_table_lock);

	for (addr=start&PAGE_MASK; addr<=end; addr+=PAGE_SIZE) {
		/* Don't remove the initialization of ptep. It's
		 * there to handle unmap_continue, below.
		 */
		pte_t *ptep=0;
		int need_clean = (options & CLEAN_MATCHING_PAGES);
		int stop_scan = 0;

		/* Periodically drop the lock to allow preemption.  Testing
		 * on the G5 gives lock-hold times of about 500usec with
		 * negligible effects on overall performance.
		 */
		if (++loop_count == 1000) {
#ifdef BATCH_FLUSH
			if (need_flush) {
				need_flush = 0;
				flush_tlb_pending();
			}
#endif
			spin_unlock(&mm->page_table_lock);
			spin_lock(&mm->page_table_lock);
			loop_count = 0;
		}

		/* If the option is specified, test the specified flags
		 * against the flags stored for this page.
		 */ 
		if (options & CHECK_FLAGS) {
			mfte_t *flagp = va_to_mftep(mm, addr);
			if (!flagp)
				goto unmap_continue;

			if (!memmon_test_flags(flagp, args->flags))
				goto unmap_continue;
		}

#ifdef CONFIG_MEMMON_SWAP_SUPPORT
		/* Check if the page has been dirtied or swapped out.  These two
		 * states are always mutually exclusive.  When both options
		 * are specified we count the page if either the page is dirty
		 * or swapped.
		 */
		if (options & (CHECK_SWAPPED_STATE | CHECK_DIRTY_STATE)) {
			int match = 0;

			ptep = va_to_ptep_map(mm, addr);
			if (!ptep) goto unmap_continue;

			if (pte_present(*ptep) && (options & CHECK_DIRTY_STATE)) {
				match = pte_dirty(*ptep);
				if (!match) match |= PageDirty(pte_page(*ptep));
				if (!match) match |= PageSwapCache(pte_page(*ptep));
			} else if (!pte_present(*ptep) && (options & CHECK_SWAPPED_STATE)) {
				match = !pte_none(*ptep);
			}

			if (!match) goto unmap_continue;
		}
#else
		/* Check if the page has been dirtied. */
		if (options & CHECK_DIRTY_STATE) {
			ptep = va_to_ptep_map(mm, addr);

			if (!ptep || !pte_present(*ptep) || !pte_dirty(*ptep)) {
				goto unmap_continue;
			}
		}
#endif

		/* The page passed all requested checks. Count it. This must
		 * happen before the comparison against "entries" below.
		 */
		page_count++;

		/* See if we want to store the page address. */
		if (page_count <= entries) {
			/* NOTE(review): this writes to user memory while
			 * page_table_lock and mmap_sem are held, and the
			 * result of __put_user() is ignored.  It looks like
			 * a fault on the buffer cannot be serviced safely
			 * from here -- TODO confirm what callers guarantee
			 * about the buffer.
			 */
			__put_user(addr, buf);
			buf++;

			/* Handle option to stop early.  Only remember the
			 * decision here: "addr" is still needed below to
			 * clean this very page.
			 */
			if ((page_count == entries) &&
				(options & STOP_WHEN_BUF_FULL))
				stop_scan = 1;

			/* Handle option to clean the page. */
			if (options & CLEAN_STORED_PAGES)
				need_clean=1;
		}

		/* WARNING: A call to this function does not cause the contents 
		 * of page to be swapped to disk.  It simply clears the "dirty"
		 * flag and invalidates the TLB.  Calling this function is a 
		 * great way to lose data during swapping.
		 */
		if (need_clean) {
			struct vm_area_struct *vma = find_vma(mm, addr);

			/* No dirty/swapped check was requested, so the PTE
			 * has not been looked up yet.
			 */
			if (!ptep)
				ptep = va_to_ptep_map(mm, addr);

			/* Only a present PTE inside a VMA can be cleaned: the
			 * dirty bit is only meaningful in a present PTE, and
			 * find_vma() may return NULL or a VMA that starts
			 * above "addr".
			 */
			if (ptep && pte_present(*ptep) &&
			    vma && vma->vm_start <= addr) {
#ifdef BATCH_FLUSH
				/* On some architectures (currently only ppc64)
				 * we can do all of the test/clear individually
				 * here, then flush them all at once.  This
				 * saves about 20% of the cost of a 10000
				 * dirty-page scan over 1GB of memory.
				 */
				ptep_test_and_clear_dirty(vma, addr, ptep);
				need_flush = 1;
#else
				ptep_clear_flush_dirty(vma, addr, ptep);
#endif
			}
		}

unmap_continue:
		if (ptep)
			pte_unmap(ptep);

		/* Leave before the increment when asked to stop early or
		 * when the last page of the range has just been handled.
		 */
		if (stop_scan || addr == last_page)
			break;
	}
	spin_unlock(&mm->page_table_lock);
	ret = page_count;

	up_write(&mm->mmap_sem);
out:
#ifdef BATCH_FLUSH
	/* Flush whatever was cleaned since the last periodic flush. */
	if (need_flush)
		flush_tlb_pending();
#endif
	return ret;
}

