File: /usr/src/sysdig-0.27.1/main.c
/*
Copyright (c) 2013-2018 Draios Inc. dba Sysdig.
This file is dual licensed under either the MIT or GPL 2. See MIT.txt
or GPL2.txt for full copies of the license.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/version.h>
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 20)
#include <linux/kobject.h>
#include <trace/sched.h>
#include "ppm_syscall.h"
#include <trace/syscall.h>
#else
#include <asm/syscall.h>
#endif
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 37))
#include <asm/atomic.h>
#else
#include <linux/atomic.h>
#endif
#include <linux/cdev.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kdev_t.h>
#include <linux/delay.h>
#include <linux/proc_fs.h>
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0))
#include <linux/sched.h>
#else
#include <linux/sched/signal.h>
#include <linux/sched/cputime.h>
#endif
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/tracepoint.h>
#include <linux/cpu.h>
#include <linux/jiffies.h>
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 26))
#include <linux/file.h>
#else
#include <linux/fdtable.h>
#endif
#include <net/sock.h>
#include <asm/unistd.h>
#include "driver_config.h"
#include "ppm_ringbuffer.h"
#include "ppm_events_public.h"
#include "ppm_events.h"
#include "ppm.h"
#if defined(CONFIG_IA32_EMULATION) && !defined(__NR_ia32_socketcall)
#include "ppm_compat_unistd_32.h"
#endif
MODULE_LICENSE("GPL");
MODULE_AUTHOR("sysdig inc");
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 35))
#define TRACEPOINT_PROBE_REGISTER(p1, p2) tracepoint_probe_register(p1, p2)
#define TRACEPOINT_PROBE_UNREGISTER(p1, p2) tracepoint_probe_unregister(p1, p2)
#define TRACEPOINT_PROBE(probe, args...) static void probe(args)
#else
#define TRACEPOINT_PROBE_REGISTER(p1, p2) tracepoint_probe_register(p1, p2, NULL)
#define TRACEPOINT_PROBE_UNREGISTER(p1, p2) tracepoint_probe_unregister(p1, p2, NULL)
#define TRACEPOINT_PROBE(probe, args...) static void probe(void *__data, args)
#endif
// Allow the build to succeed on architectures where _PAGE_ENC is not implemented
#ifndef _PAGE_ENC
#define _PAGE_ENC 0
#endif
struct ppm_device {
dev_t dev;
struct cdev cdev;
wait_queue_head_t read_queue;
};
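/*
* Per-tracepoint payload handed from the probes below to
* record_event_all_consumers()/record_event_consumer().
* The category field selects which member of the event_info union is valid:
* syscall_data for sys_enter/sys_exit, context_data for scheduler events,
* signal_data for signal deliveries and fault_data for page faults.
*/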
struct event_data_t {
enum ppm_capture_category category;
int socketcall_syscall;
bool compat;
union {
struct {
struct pt_regs *regs;
long id;
const enum ppm_syscall_code *cur_g_syscall_code_routing_table;
} syscall_data;
struct {
struct task_struct *sched_prev;
struct task_struct *sched_next;
} context_data;
struct {
int sig;
struct siginfo *info;
struct k_sigaction *ka;
} signal_data;
struct fault_data_t fault_data;
} event_info;
};
/*
* FORWARD DECLARATIONS
*/
static int ppm_open(struct inode *inode, struct file *filp);
static int ppm_release(struct inode *inode, struct file *filp);
static long ppm_ioctl(struct file *f, unsigned int cmd, unsigned long arg);
static int ppm_mmap(struct file *filp, struct vm_area_struct *vma);
static int record_event_consumer(struct ppm_consumer_t *consumer,
enum ppm_event_type event_type,
enum syscall_flags drop_flags,
nanoseconds ns,
struct event_data_t *event_datap);
static void record_event_all_consumers(enum ppm_event_type event_type,
enum syscall_flags drop_flags,
struct event_data_t *event_datap);
static int init_ring_buffer(struct ppm_ring_buffer_context *ring);
static void free_ring_buffer(struct ppm_ring_buffer_context *ring);
static void reset_ring_buffer(struct ppm_ring_buffer_context *ring);
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0))
void ppm_task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
#endif
#ifndef CONFIG_HAVE_SYSCALL_TRACEPOINTS
#error The kernel must have HAVE_SYSCALL_TRACEPOINTS in order for sysdig to be useful
#endif
TRACEPOINT_PROBE(syscall_enter_probe, struct pt_regs *regs, long id);
TRACEPOINT_PROBE(syscall_exit_probe, struct pt_regs *regs, long ret);
TRACEPOINT_PROBE(syscall_procexit_probe, struct task_struct *p);
#ifdef CAPTURE_CONTEXT_SWITCHES
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 35))
TRACEPOINT_PROBE(sched_switch_probe, struct rq *rq, struct task_struct *prev, struct task_struct *next);
#elif (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0))
TRACEPOINT_PROBE(sched_switch_probe, struct task_struct *prev, struct task_struct *next);
#else
TRACEPOINT_PROBE(sched_switch_probe, bool preempt, struct task_struct *prev, struct task_struct *next);
#endif /* (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35)) */
#endif /* CAPTURE_CONTEXT_SWITCHES */
#ifdef CAPTURE_SIGNAL_DELIVERIES
TRACEPOINT_PROBE(signal_deliver_probe, int sig, struct siginfo *info, struct k_sigaction *ka);
#endif
#ifdef CAPTURE_PAGE_FAULTS
TRACEPOINT_PROBE(page_fault_probe, unsigned long address, struct pt_regs *regs, unsigned long error_code);
#endif
DECLARE_BITMAP(g_events_mask, PPM_EVENT_MAX);
static struct ppm_device *g_ppm_devs;
static struct class *g_ppm_class;
static unsigned int g_ppm_numdevs;
static int g_ppm_major;
bool g_tracers_enabled = false;
bool g_simple_mode_enabled = false;
static DEFINE_PER_CPU(long, g_n_tracepoint_hit);
static const struct file_operations g_ppm_fops = {
.open = ppm_open,
.release = ppm_release,
.mmap = ppm_mmap,
.unlocked_ioctl = ppm_ioctl,
.owner = THIS_MODULE,
};
/*
* GLOBALS
*/
LIST_HEAD(g_consumer_list);
static DEFINE_MUTEX(g_consumer_mutex);
static bool g_tracepoint_registered;
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
static struct tracepoint *tp_sys_enter;
static struct tracepoint *tp_sys_exit;
#endif
static struct tracepoint *tp_sched_process_exit;
#ifdef CAPTURE_CONTEXT_SWITCHES
static struct tracepoint *tp_sched_switch;
#endif
#ifdef CAPTURE_SIGNAL_DELIVERIES
static struct tracepoint *tp_signal_deliver;
#endif
#ifdef CAPTURE_PAGE_FAULTS
// Even on kernels built with page fault tracepoint support, the tracepoints may
// not be found at runtime, so check whether g_fault_tracepoint_disabled is set.
static struct tracepoint *tp_page_fault_user;
static struct tracepoint *tp_page_fault_kernel;
static bool g_fault_tracepoint_registered;
static bool g_fault_tracepoint_disabled;
#endif
#ifdef _DEBUG
static bool verbose = 1;
#else
static bool verbose = 0;
#endif
static unsigned int max_consumers = 5;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
static enum cpuhp_state hp_state = 0;
#endif
#define vpr_info(fmt, ...) \
do { \
if (verbose) \
pr_info(fmt, ##__VA_ARGS__); \
} while (0)
static inline nanoseconds ppm_nsecs(void)
{
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0))
return ktime_get_real_ns();
#else
/* Don't have ktime_get_real functions */
struct timespec ts;
getnstimeofday(&ts);
return SECOND_IN_NS * ts.tv_sec + ts.tv_nsec;
#endif
}
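/*
* Compatibility wrapper: kernel 5.1 changed syscall_get_arguments() to drop the
* (start, length) parameters and always copy all six syscall arguments, so hide
* the signature difference here.
*/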
inline void ppm_syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args)
{
#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 1, 0))
syscall_get_arguments(task, regs, 0, 6, args);
#else
syscall_get_arguments(task, regs, args);
#endif
}
/* compat tracepoint functions */
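/*
* Since kernel 3.15 a probe is registered against a struct tracepoint pointer
* (looked up via for_each_kernel_tracepoint() in get_tracepoint_handles() below)
* instead of the tracepoint name, so wrap both flavors here.
*/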
static int compat_register_trace(void *func, const char *probename, struct tracepoint *tp)
{
#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
return TRACEPOINT_PROBE_REGISTER(probename, func);
#else
return tracepoint_probe_register(tp, func, NULL);
#endif
}
static void compat_unregister_trace(void *func, const char *probename, struct tracepoint *tp)
{
#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
TRACEPOINT_PROBE_UNREGISTER(probename, func);
#else
tracepoint_probe_unregister(tp, func, NULL);
#endif
}
static struct ppm_consumer_t *ppm_find_consumer(struct task_struct *consumer_id)
{
struct ppm_consumer_t *el = NULL;
rcu_read_lock();
list_for_each_entry_rcu(el, &g_consumer_list, node) {
if (el->consumer_id == consumer_id) {
rcu_read_unlock();
return el;
}
}
rcu_read_unlock();
return NULL;
}
static void check_remove_consumer(struct ppm_consumer_t *consumer, int remove_from_list)
{
int cpu;
int open_rings = 0;
for_each_possible_cpu(cpu) {
struct ppm_ring_buffer_context *ring = per_cpu_ptr(consumer->ring_buffers, cpu);
if (ring && ring->open)
++open_rings;
}
if (open_rings == 0) {
pr_info("deallocating consumer %p\n", consumer->consumer_id);
if (remove_from_list) {
list_del_rcu(&consumer->node);
synchronize_rcu();
}
for_each_possible_cpu(cpu) {
struct ppm_ring_buffer_context *ring = per_cpu_ptr(consumer->ring_buffers, cpu);
free_ring_buffer(ring);
}
free_percpu(consumer->ring_buffers);
vfree(consumer);
}
}
/*
* user I/O functions
*/
static int ppm_open(struct inode *inode, struct file *filp)
{
int ret;
int in_list = false;
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
int ring_no = iminor(filp->f_path.dentry->d_inode);
#else
int ring_no = iminor(filp->f_dentry->d_inode);
#endif
struct task_struct *consumer_id = current;
struct ppm_consumer_t *consumer = NULL;
struct ppm_ring_buffer_context *ring = NULL;
/*
* Tricky: to identify a consumer, attach the thread id
* to the newly opened file descriptor
*/
filp->private_data = consumer_id;
mutex_lock(&g_consumer_mutex);
consumer = ppm_find_consumer(consumer_id);
if (!consumer) {
unsigned int cpu;
unsigned int num_consumers = 0;
struct ppm_consumer_t *el = NULL;
rcu_read_lock();
list_for_each_entry_rcu(el, &g_consumer_list, node) {
++num_consumers;
}
rcu_read_unlock();
if (num_consumers >= max_consumers) {
pr_err("maximum number of consumers reached\n");
ret = -EBUSY;
goto cleanup_open;
}
pr_info("adding new consumer %p\n", consumer_id);
consumer = vmalloc(sizeof(struct ppm_consumer_t));
if (!consumer) {
pr_err("can't allocate consumer\n");
ret = -ENOMEM;
goto cleanup_open;
}
consumer->consumer_id = consumer_id;
/*
* Initialize the ring buffers array
*/
consumer->ring_buffers = alloc_percpu(struct ppm_ring_buffer_context);
if (consumer->ring_buffers == NULL) {
pr_err("can't allocate the ring buffer array\n");
vfree(consumer);
ret = -ENOMEM;
goto cleanup_open;
}
/*
* Note: we use two loops here because the first one makes sure that ALL of the
* rings are initialized to NULL before the second one runs, since the second one
* can fail partway through and trigger the cleanup phase.
* This might not be necessary, because alloc_percpu memsets the allocated entries
* to 0, but it's better to be extra safe.
*/
for_each_possible_cpu(cpu) {
ring = per_cpu_ptr(consumer->ring_buffers, cpu);
ring->cpu_online = false;
ring->str_storage = NULL;
ring->buffer = NULL;
ring->info = NULL;
}
/*
* If a cpu is offline when the consumer is first created, we
* will never get events for that cpu even if it later comes
* online via hotplug. We could allocate these rings on-demand
* later in this function if needed for hotplug, but that
* requires the consumer to know to call open again, and sysdig
* doesn't support that.
*/
for_each_online_cpu(cpu) {
ring = per_cpu_ptr(consumer->ring_buffers, cpu);
pr_info("initializing ring buffer for CPU %u\n", cpu);
if (!init_ring_buffer(ring)) {
pr_err("can't initialize the ring buffer for CPU %u\n", cpu);
ret = -ENOMEM;
goto err_init_ring_buffer;
}
ring->cpu_online = true;
}
list_add_rcu(&consumer->node, &g_consumer_list);
in_list = true;
} else {
vpr_info("found already existent consumer %p\n", consumer_id);
}
ring = per_cpu_ptr(consumer->ring_buffers, ring_no);
/*
* Check if the CPU pointed by this device is online. If it isn't stop here and
* return ENODEV. The cpu could be online while buffer is NULL if there's a cpu
* online hotplug callback between the first open on this consumer and the open
* for this particular device.
*/
if (ring->cpu_online == false || ring->buffer == NULL) {
ret = -ENODEV;
goto cleanup_open;
}
if (ring->open) {
pr_err("invalid operation: attempting to open device %d multiple times for consumer %p\n", ring_no, consumer->consumer_id);
ret = -EBUSY;
goto cleanup_open;
}
vpr_info("opening ring %d, consumer %p\n", ring_no, consumer->consumer_id);
/*
* ring->preempt_count is not reset to 0 on purpose, to prevent a race condition:
* if the same device is quickly closed and then reopened, record_event() might still be executing
* (with ring->preempt_count at 1) while ppm_open() resets ring->preempt_count to 0.
* When record_event() exits, it decrements ring->preempt_count, which becomes < 0,
* leading to the complete loss of all subsequent events for that CPU.
*/
consumer->dropping_mode = 0;
consumer->snaplen = RW_SNAPLEN;
consumer->sampling_ratio = 1;
consumer->sampling_interval = 0;
consumer->is_dropping = 0;
consumer->do_dynamic_snaplen = false;
consumer->need_to_insert_drop_e = 0;
consumer->need_to_insert_drop_x = 0;
consumer->fullcapture_port_range_start = 0;
consumer->fullcapture_port_range_end = 0;
consumer->statsd_port = PPM_PORT_STATSD;
bitmap_fill(g_events_mask, PPM_EVENT_MAX); /* Enable all syscalls to be passed to user space */
reset_ring_buffer(ring);
ring->open = true;
if (!g_tracepoint_registered) {
pr_info("starting capture\n");
/*
* Enable the tracepoints
*/
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
ret = compat_register_trace(syscall_exit_probe, "sys_exit", tp_sys_exit);
#else
ret = register_trace_syscall_exit(syscall_exit_probe);
#endif
if (ret) {
pr_err("can't create the sys_exit tracepoint\n");
goto err_sys_exit;
}
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
ret = compat_register_trace(syscall_enter_probe, "sys_enter", tp_sys_enter);
#else
ret = register_trace_syscall_enter(syscall_enter_probe);
#endif
if (ret) {
pr_err("can't create the sys_enter tracepoint\n");
goto err_sys_enter;
}
ret = compat_register_trace(syscall_procexit_probe, "sched_process_exit", tp_sched_process_exit);
if (ret) {
pr_err("can't create the sched_process_exit tracepoint\n");
goto err_sched_procexit;
}
#ifdef CAPTURE_CONTEXT_SWITCHES
ret = compat_register_trace(sched_switch_probe, "sched_switch", tp_sched_switch);
if (ret) {
pr_err("can't create the sched_switch tracepoint\n");
goto err_sched_switch;
}
#endif
#ifdef CAPTURE_SIGNAL_DELIVERIES
ret = compat_register_trace(signal_deliver_probe, "signal_deliver", tp_signal_deliver);
if (ret) {
pr_err("can't create the signal_deliver tracepoint\n");
goto err_signal_deliver;
}
#endif
g_tracepoint_registered = true;
}
ret = 0;
goto cleanup_open;
#ifdef CAPTURE_SIGNAL_DELIVERIES
err_signal_deliver:
compat_unregister_trace(sched_switch_probe, "sched_switch", tp_sched_switch);
#endif
err_sched_switch:
compat_unregister_trace(syscall_procexit_probe, "sched_process_exit", tp_sched_process_exit);
err_sched_procexit:
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
compat_unregister_trace(syscall_enter_probe, "sys_enter", tp_sys_enter);
#else
unregister_trace_syscall_enter(syscall_enter_probe);
#endif
err_sys_enter:
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
compat_unregister_trace(syscall_exit_probe, "sys_exit", tp_sys_exit);
#else
unregister_trace_syscall_exit(syscall_exit_probe);
#endif
err_sys_exit:
ring->open = false;
err_init_ring_buffer:
check_remove_consumer(consumer, in_list);
cleanup_open:
mutex_unlock(&g_consumer_mutex);
return ret;
}
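/*
* Illustrative userspace usage (a minimal sketch, not part of the driver): it
* assumes the per-CPU devices are exposed as /dev/sysdig0..N (the actual name
* comes from driver_config.h) and that RING_BUF_SIZE matches the value in
* ppm_ringbuffer.h. All devices of a capture must be opened by the same thread,
* since the consumer is identified by the opener's task_struct:
*
*   int fd = open("/dev/sysdig0", O_RDWR);
*   struct ppm_ring_buffer_info *info = mmap(NULL, sizeof(*info),
*       PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
*   char *buf = mmap(NULL, RING_BUF_SIZE * 2, PROT_READ, MAP_SHARED, fd, 0);
*   ioctl(fd, PPM_IOCTL_ENABLE_CAPTURE);
*
* The info mapping must be writable (the consumer advances info->tail), while
* the data buffer mapping is enforced read-only by ppm_mmap() below.
*/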
static int ppm_release(struct inode *inode, struct file *filp)
{
int cpu;
int ret;
struct ppm_ring_buffer_context *ring;
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
int ring_no = iminor(filp->f_path.dentry->d_inode);
#else
int ring_no = iminor(filp->f_dentry->d_inode);
#endif
struct task_struct *consumer_id = filp->private_data;
struct ppm_consumer_t *consumer = NULL;
mutex_lock(&g_consumer_mutex);
consumer = ppm_find_consumer(consumer_id);
if (!consumer) {
pr_err("release: unknown consumer %p\n", consumer_id);
ret = -EBUSY;
goto cleanup_release;
}
ring = per_cpu_ptr(consumer->ring_buffers, ring_no);
if (!ring) {
ASSERT(false);
ret = -ENODEV;
goto cleanup_release;
}
if (!ring->open) {
pr_err("attempting to close unopened device %d for consumer %p\n", ring_no, consumer_id);
ret = -EBUSY;
goto cleanup_release;
}
ring->capture_enabled = false;
vpr_info("closing ring %d, consumer:%p evt:%llu, dr_buf:%llu, dr_pf:%llu, pr:%llu, cs:%llu\n",
ring_no,
consumer_id,
ring->info->n_evts,
ring->info->n_drops_buffer,
ring->info->n_drops_pf,
ring->info->n_preemptions,
ring->info->n_context_switches);
ring->open = false;
check_remove_consumer(consumer, true);
/*
* The last closed device stops event collection
*/
if (list_empty(&g_consumer_list)) {
if (g_tracepoint_registered) {
pr_info("no more consumers, stopping capture\n");
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
compat_unregister_trace(syscall_exit_probe, "sys_exit", tp_sys_exit);
compat_unregister_trace(syscall_enter_probe, "sys_enter", tp_sys_enter);
#else
unregister_trace_syscall_exit(syscall_exit_probe);
unregister_trace_syscall_enter(syscall_enter_probe);
#endif
compat_unregister_trace(syscall_procexit_probe, "sched_process_exit", tp_sched_process_exit);
#ifdef CAPTURE_CONTEXT_SWITCHES
compat_unregister_trace(sched_switch_probe, "sched_switch", tp_sched_switch);
#endif
#ifdef CAPTURE_SIGNAL_DELIVERIES
compat_unregister_trace(signal_deliver_probe, "signal_deliver", tp_signal_deliver);
#endif
#ifdef CAPTURE_PAGE_FAULTS
if (g_fault_tracepoint_registered) {
compat_unregister_trace(page_fault_probe, "page_fault_user", tp_page_fault_user);
compat_unregister_trace(page_fault_probe, "page_fault_kernel", tp_page_fault_kernel);
g_fault_tracepoint_registered = false;
}
#endif
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
tracepoint_synchronize_unregister();
#endif
g_tracepoint_registered = false;
/*
* While we're here, disable simple mode if it's active
*/
g_simple_mode_enabled = false;
/*
* Reset tracepoint counter
*/
for_each_possible_cpu(cpu) {
per_cpu(g_n_tracepoint_hit, cpu) = 0;
}
} else {
ASSERT(false);
}
}
ret = 0;
cleanup_release:
mutex_unlock(&g_consumer_mutex);
return ret;
}
static long ppm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int cpu;
int ret;
struct task_struct *consumer_id = filp->private_data;
struct ppm_consumer_t *consumer = NULL;
if (cmd == PPM_IOCTL_GET_PROCLIST) {
struct ppm_proclist_info *proclist_info = NULL;
struct task_struct *p, *t;
u64 nentries = 0;
struct ppm_proclist_info pli;
u32 memsize;
if (copy_from_user(&pli, (void *)arg, sizeof(pli))) {
ret = -EINVAL;
goto cleanup_ioctl_nolock;
}
if (pli.max_entries < 0 || pli.max_entries > 1000000) {
vpr_info("PPM_IOCTL_GET_PROCLIST: invalid max_entries %llu\n", pli.max_entries);
ret = -EINVAL;
goto cleanup_ioctl_procinfo;
}
vpr_info("PPM_IOCTL_GET_PROCLIST, size=%d\n", (int)pli.max_entries);
memsize = sizeof(struct ppm_proclist_info) + sizeof(struct ppm_proc_info) * pli.max_entries;
proclist_info = vmalloc(memsize);
if (!proclist_info) {
ret = -EINVAL;
goto cleanup_ioctl_nolock;
}
proclist_info->max_entries = pli.max_entries;
rcu_read_lock();
#ifdef for_each_process_thread
for_each_process_thread(p, t) {
#else
#ifdef for_each_process_all
for_each_process_all(p) {
#else
for_each_process(p) {
#endif
t = p;
do {
task_lock(p);
#endif
if (nentries < pli.max_entries) {
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0))
cputime_t utime, stime;
#else
u64 utime, stime;
#endif
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0))
task_cputime_adjusted(t, &utime, &stime);
#else
ppm_task_cputime_adjusted(t, &utime, &stime);
#endif
proclist_info->entries[nentries].pid = t->pid;
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 11, 0))
proclist_info->entries[nentries].utime = cputime_to_clock_t(utime);
proclist_info->entries[nentries].stime = cputime_to_clock_t(stime);
#else
proclist_info->entries[nentries].utime = nsec_to_clock_t(utime);
proclist_info->entries[nentries].stime = nsec_to_clock_t(stime);
#endif
}
nentries++;
#ifdef for_each_process_thread
}
#else
task_unlock(p);
#ifdef while_each_thread_all
} while_each_thread_all(p, t);
}
#else
} while_each_thread(p, t);
}
#endif
#endif
rcu_read_unlock();
proclist_info->n_entries = nentries;
if (nentries >= pli.max_entries) {
vpr_info("PPM_IOCTL_GET_PROCLIST: not enough space (%d avail, %d required)\n",
(int)pli.max_entries,
(int)nentries);
if (copy_to_user((void *)arg, proclist_info, sizeof(struct ppm_proclist_info))) {
ret = -EINVAL;
goto cleanup_ioctl_procinfo;
}
ret = -ENOSPC;
goto cleanup_ioctl_procinfo;
} else {
memsize = sizeof(struct ppm_proclist_info) + sizeof(struct ppm_proc_info) * nentries;
if (copy_to_user((void *)arg, proclist_info, memsize)) {
ret = -EINVAL;
goto cleanup_ioctl_procinfo;
}
}
ret = 0;
cleanup_ioctl_procinfo:
vfree((void *)proclist_info);
goto cleanup_ioctl_nolock;
}
if (cmd == PPM_IOCTL_GET_N_TRACEPOINT_HIT) {
long __user *counters = (long __user *) arg;
for_each_possible_cpu(cpu) {
if (put_user(per_cpu(g_n_tracepoint_hit, cpu), &counters[cpu])) {
ret = -EINVAL;
goto cleanup_ioctl_nolock;
}
}
ret = 0;
goto cleanup_ioctl_nolock;
} else if (cmd == PPM_IOCTL_GET_PROBE_VERSION) {
if (copy_to_user((void *)arg, PROBE_VERSION, sizeof(PROBE_VERSION))) {
ret = -EINVAL;
goto cleanup_ioctl_nolock;
}
ret = 0;
goto cleanup_ioctl_nolock;
}
mutex_lock(&g_consumer_mutex);
consumer = ppm_find_consumer(consumer_id);
if (!consumer) {
pr_err("ioctl: unknown consumer %p\n", consumer_id);
ret = -EBUSY;
goto cleanup_ioctl;
}
switch (cmd) {
case PPM_IOCTL_DISABLE_CAPTURE:
{
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
int ring_no = iminor(filp->f_path.dentry->d_inode);
#else
int ring_no = iminor(filp->f_dentry->d_inode);
#endif
struct ppm_ring_buffer_context *ring = per_cpu_ptr(consumer->ring_buffers, ring_no);
if (!ring) {
ASSERT(false);
ret = -ENODEV;
goto cleanup_ioctl;
}
ring->capture_enabled = false;
vpr_info("PPM_IOCTL_DISABLE_CAPTURE for ring %d, consumer %p\n", ring_no, consumer_id);
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_ENABLE_CAPTURE:
{
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
int ring_no = iminor(filp->f_path.dentry->d_inode);
#else
int ring_no = iminor(filp->f_dentry->d_inode);
#endif
struct ppm_ring_buffer_context *ring = per_cpu_ptr(consumer->ring_buffers, ring_no);
if (!ring) {
ASSERT(false);
ret = -ENODEV;
goto cleanup_ioctl;
}
ring->capture_enabled = true;
vpr_info("PPM_IOCTL_ENABLE_CAPTURE for ring %d, consumer %p\n", ring_no, consumer_id);
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_DISABLE_DROPPING_MODE:
{
struct event_data_t event_data;
vpr_info("PPM_IOCTL_DISABLE_DROPPING_MODE, consumer %p\n", consumer_id);
consumer->dropping_mode = 0;
consumer->sampling_interval = 1000000000;
consumer->sampling_ratio = 1;
/*
* Push an event into the ring buffer so that the user can know that dropping
* mode has been disabled
*/
event_data.category = PPMC_CONTEXT_SWITCH;
event_data.event_info.context_data.sched_prev = (void *)DEI_DISABLE_DROPPING;
event_data.event_info.context_data.sched_next = (void *)0;
record_event_consumer(consumer, PPME_SYSDIGEVENT_E, UF_NEVER_DROP, ppm_nsecs(), &event_data);
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_ENABLE_DROPPING_MODE:
{
u32 new_sampling_ratio;
consumer->dropping_mode = 1;
vpr_info("PPM_IOCTL_ENABLE_DROPPING_MODE, consumer %p\n", consumer_id);
new_sampling_ratio = (u32)arg;
if (new_sampling_ratio != 1 &&
new_sampling_ratio != 2 &&
new_sampling_ratio != 4 &&
new_sampling_ratio != 8 &&
new_sampling_ratio != 16 &&
new_sampling_ratio != 32 &&
new_sampling_ratio != 64 &&
new_sampling_ratio != 128) {
pr_err("invalid sampling ratio %u\n", new_sampling_ratio);
ret = -EINVAL;
goto cleanup_ioctl;
}
consumer->sampling_interval = 1000000000 / new_sampling_ratio;
consumer->sampling_ratio = new_sampling_ratio;
vpr_info("new sampling ratio: %d\n", new_sampling_ratio);
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_SET_SNAPLEN:
{
u32 new_snaplen;
vpr_info("PPM_IOCTL_SET_SNAPLEN, consumer %p\n", consumer_id);
new_snaplen = (u32)arg;
if (new_snaplen > RW_MAX_SNAPLEN) {
pr_err("invalid snaplen %u\n", new_snaplen);
ret = -EINVAL;
goto cleanup_ioctl;
}
consumer->snaplen = new_snaplen;
vpr_info("new snaplen: %d\n", consumer->snaplen);
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_SET_FULLCAPTURE_PORT_RANGE:
{
u32 encoded_port_range;
vpr_info("PPM_IOCTL_SET_FULLCAPTURE_PORT_RANGE, consumer %p\n", consumer_id);
encoded_port_range = (u32)arg;
consumer->fullcapture_port_range_start = encoded_port_range & 0xFFFF;
consumer->fullcapture_port_range_end = encoded_port_range >> 16;
pr_info("new fullcapture_port_range_start: %d\n", (int)consumer->fullcapture_port_range_start);
pr_info("new fullcapture_port_range_end: %d\n", (int)consumer->fullcapture_port_range_end);
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_SET_STATSD_PORT:
{
consumer->statsd_port = (u16)arg;
pr_info("new statsd_port: %d\n", (int)consumer->statsd_port);
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_MASK_ZERO_EVENTS:
{
vpr_info("PPM_IOCTL_MASK_ZERO_EVENTS, consumer %p\n", consumer_id);
bitmap_zero(g_events_mask, PPM_EVENT_MAX);
/* Used for dropping events so they must stay on */
set_bit(PPME_DROP_E, g_events_mask);
set_bit(PPME_DROP_X, g_events_mask);
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_MASK_SET_EVENT:
{
u32 syscall_to_set = (u32)arg;
vpr_info("PPM_IOCTL_MASK_SET_EVENT (%u), consumer %p\n", syscall_to_set, consumer_id);
if (syscall_to_set >= PPM_EVENT_MAX) {
pr_err("invalid syscall %u\n", syscall_to_set);
ret = -EINVAL;
goto cleanup_ioctl;
}
set_bit(syscall_to_set, g_events_mask);
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_MASK_UNSET_EVENT:
{
u32 syscall_to_unset = (u32)arg;
vpr_info("PPM_IOCTL_MASK_UNSET_EVENT (%u), consumer %p\n", syscall_to_unset, consumer_id);
if (syscall_to_unset >= PPM_EVENT_MAX) {
pr_err("invalid syscall %u\n", syscall_to_unset);
ret = -EINVAL;
goto cleanup_ioctl;
}
clear_bit(syscall_to_unset, g_events_mask);
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_DISABLE_DYNAMIC_SNAPLEN:
{
consumer->do_dynamic_snaplen = false;
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_ENABLE_DYNAMIC_SNAPLEN:
{
consumer->do_dynamic_snaplen = true;
ret = 0;
goto cleanup_ioctl;
}
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
case PPM_IOCTL_GET_VTID:
case PPM_IOCTL_GET_VPID:
{
pid_t vid;
struct pid *pid;
struct task_struct *task;
struct pid_namespace *ns;
rcu_read_lock();
pid = find_pid_ns(arg, &init_pid_ns);
if (!pid) {
rcu_read_unlock();
ret = -EINVAL;
goto cleanup_ioctl;
}
task = pid_task(pid, PIDTYPE_PID);
if (!task) {
rcu_read_unlock();
ret = -EINVAL;
goto cleanup_ioctl;
}
ns = ns_of_pid(pid);
if (!ns) {
rcu_read_unlock();
ret = -EINVAL;
goto cleanup_ioctl;
}
if (cmd == PPM_IOCTL_GET_VTID)
vid = task_pid_nr_ns(task, ns);
else
vid = task_tgid_nr_ns(task, ns);
rcu_read_unlock();
ret = vid;
goto cleanup_ioctl;
}
#endif
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
case PPM_IOCTL_GET_CURRENT_TID:
ret = task_pid_nr(current);
goto cleanup_ioctl;
case PPM_IOCTL_GET_CURRENT_PID:
ret = task_tgid_nr(current);
goto cleanup_ioctl;
#endif /* LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20) */
#ifdef CAPTURE_SIGNAL_DELIVERIES
case PPM_IOCTL_DISABLE_SIGNAL_DELIVER:
{
vpr_info("PPM_IOCTL_DISABLE_SIGNAL_DELIVER\n");
if (g_tracepoint_registered)
compat_unregister_trace(signal_deliver_probe, "signal_deliver", tp_signal_deliver);
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_ENABLE_SIGNAL_DELIVER:
{
vpr_info("PPM_IOCTL_ENABLE_SIGNAL_DELIVER\n");
if (g_tracepoint_registered)
compat_register_trace(signal_deliver_probe, "signal_deliver", tp_signal_deliver);
ret = 0;
goto cleanup_ioctl;
}
#endif
case PPM_IOCTL_SET_TRACERS_CAPTURE:
{
vpr_info("PPM_IOCTL_SET_TRACERS_CAPTURE, consumer %p\n", consumer_id);
g_tracers_enabled = true;
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_SET_SIMPLE_MODE:
{
vpr_info("PPM_IOCTL_SET_SIMPLE_MODE, consumer %p\n", consumer_id);
g_simple_mode_enabled = true;
ret = 0;
goto cleanup_ioctl;
}
case PPM_IOCTL_ENABLE_PAGE_FAULTS:
{
vpr_info("PPM_IOCTL_ENABLE_PAGE_FAULTS\n");
#ifdef CAPTURE_PAGE_FAULTS
ASSERT(g_tracepoint_registered);
if (g_fault_tracepoint_disabled) {
pr_err("kernel page fault tracepoints are disabled\n");
ret = -EPERM;
goto cleanup_ioctl;
}
if (!g_fault_tracepoint_registered) {
ret = compat_register_trace(page_fault_probe, "page_fault_user", tp_page_fault_user);
if (ret) {
pr_err("can't create the page_fault_user tracepoint\n");
ret = -EINVAL;
goto cleanup_ioctl;
}
ret = compat_register_trace(page_fault_probe, "page_fault_kernel", tp_page_fault_kernel);
if (ret) {
pr_err("can't create the page_fault_kernel tracepoint\n");
ret = -EINVAL;
goto err_page_fault_kernel;
}
g_fault_tracepoint_registered = true;
}
ret = 0;
goto cleanup_ioctl;
#else
pr_err("kernel doesn't support page fault tracepoints\n");
ret = -EINVAL;
goto cleanup_ioctl;
#endif
}
default:
ret = -ENOTTY;
goto cleanup_ioctl;
}
#ifdef CAPTURE_PAGE_FAULTS
err_page_fault_kernel:
compat_unregister_trace(page_fault_probe, "page_fault_user", tp_page_fault_user);
#endif
cleanup_ioctl:
mutex_unlock(&g_consumer_mutex);
cleanup_ioctl_nolock:
return ret;
}
static int ppm_mmap(struct file *filp, struct vm_area_struct *vma)
{
int ret;
struct task_struct *consumer_id = filp->private_data;
struct ppm_consumer_t *consumer = NULL;
mutex_lock(&g_consumer_mutex);
consumer = ppm_find_consumer(consumer_id);
if (!consumer) {
pr_err("mmap: unknown consumer %p\n", consumer_id);
ret = -EIO;
goto cleanup_mmap;
}
if (vma->vm_pgoff == 0) {
long length = vma->vm_end - vma->vm_start;
unsigned long useraddr = vma->vm_start;
unsigned long pfn;
char *vmalloc_area_ptr;
char *orig_vmalloc_area_ptr;
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
int ring_no = iminor(filp->f_path.dentry->d_inode);
#else
int ring_no = iminor(filp->f_dentry->d_inode);
#endif
struct ppm_ring_buffer_context *ring;
vpr_info("mmap for consumer %p, CPU %d, start=%lu len=%ld page_size=%lu\n",
consumer_id,
ring_no,
useraddr,
length,
PAGE_SIZE);
/*
* Enforce ring buffer size
*/
if (RING_BUF_SIZE < 2 * PAGE_SIZE) {
pr_err("Ring buffer size too small (%ld bytes, must be at least %ld bytes\n",
(long)RING_BUF_SIZE,
(long)PAGE_SIZE);
ret = -EIO;
goto cleanup_mmap;
}
if (RING_BUF_SIZE / PAGE_SIZE * PAGE_SIZE != RING_BUF_SIZE) {
pr_err("Ring buffer size is not a multiple of the page size\n");
ret = -EIO;
goto cleanup_mmap;
}
/*
* Retrieve the ring structure for this CPU
*/
ring = per_cpu_ptr(consumer->ring_buffers, ring_no);
if (!ring) {
ASSERT(false);
ret = -ENODEV;
goto cleanup_mmap;
}
if (length <= PAGE_SIZE) {
/*
* When the size requested by the user is smaller than a page, we assume
* she's mapping the ring info structure
*/
vpr_info("mapping the ring info\n");
vmalloc_area_ptr = (char *)ring->info;
orig_vmalloc_area_ptr = vmalloc_area_ptr;
pfn = vmalloc_to_pfn(vmalloc_area_ptr);
pgprot_val(vma->vm_page_prot) = pgprot_val(PAGE_SHARED) | _PAGE_ENC;
ret = remap_pfn_range(vma, useraddr, pfn,
PAGE_SIZE, vma->vm_page_prot);
if (ret < 0) {
pr_err("remap_pfn_range failed (1)\n");
goto cleanup_mmap;
}
ret = 0;
goto cleanup_mmap;
} else if (length == RING_BUF_SIZE * 2) {
long mlength;
/*
* When the size requested by the user is twice the ring buffer size, we map the full
* buffer (twice in a row, creating the mirrored mapping described below)
*/
vpr_info("mapping the data buffer\n");
vmalloc_area_ptr = (char *)ring->buffer;
orig_vmalloc_area_ptr = vmalloc_area_ptr;
/*
* Validate that the buffer access is read only
*/
if (vma->vm_flags & VM_WRITE) {
pr_err("invalid mmap flags 0x%lx\n", vma->vm_flags);
ret = -EIO;
goto cleanup_mmap;
}
/*
* Map each single page of the buffer
*/
mlength = length / 2;
while (mlength > 0) {
pfn = vmalloc_to_pfn(vmalloc_area_ptr);
pgprot_val(vma->vm_page_prot) = pgprot_val(PAGE_SHARED) | _PAGE_ENC;
ret = remap_pfn_range(vma, useraddr, pfn,
PAGE_SIZE, vma->vm_page_prot);
if (ret < 0) {
pr_err("remap_pfn_range failed (1)\n");
goto cleanup_mmap;
}
useraddr += PAGE_SIZE;
vmalloc_area_ptr += PAGE_SIZE;
mlength -= PAGE_SIZE;
}
/*
* Remap a second copy of the buffer pages at the end of the buffer.
* This effectively mirrors the buffer at its end and helps simplify buffer management in userland.
*/
vmalloc_area_ptr = orig_vmalloc_area_ptr;
mlength = length / 2;
while (mlength > 0) {
pfn = vmalloc_to_pfn(vmalloc_area_ptr);
pgprot_val(vma->vm_page_prot) = pgprot_val(PAGE_SHARED) | _PAGE_ENC;
ret = remap_pfn_range(vma, useraddr, pfn,
PAGE_SIZE, vma->vm_page_prot);
if (ret < 0) {
pr_err("remap_pfn_range failed (1)\n");
goto cleanup_mmap;
}
useraddr += PAGE_SIZE;
vmalloc_area_ptr += PAGE_SIZE;
mlength -= PAGE_SIZE;
}
ret = 0;
goto cleanup_mmap;
}
pr_err("Invalid mmap size %ld\n", length);
ret = -EIO;
goto cleanup_mmap;
}
pr_err("invalid pgoff %lu, must be 0\n", vma->vm_pgoff);
ret = -EIO;
cleanup_mmap:
mutex_unlock(&g_consumer_mutex);
return ret;
}
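/*
* Because the data buffer pages are mapped twice back to back, an event that
* wraps past the end of the ring can be read by the consumer as a contiguous
* blob: byte RING_BUF_SIZE + k of the mapping aliases byte k of the buffer.
*/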
/* Argument list sizes for sys_socketcall */
#define AL(x) ((x) * sizeof(unsigned long))
static const unsigned char nas[21] = {
AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
AL(4), AL(5), AL(4)
};
#undef AL
#ifdef CONFIG_COMPAT
#define AL(x) ((x) * sizeof(compat_ulong_t))
static const unsigned char compat_nas[21] = {
AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
AL(4), AL(5), AL(4)
};
#undef AL
#endif
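/*
* nas[]/compat_nas[] are indexed by the socketcall code and give the size in
* bytes of that call's argument block, e.g. SYS_CONNECT takes 3 arguments so
* nas[SYS_CONNECT] is AL(3). parse_socketcall() below uses them to bound the
* copy of the packed argument array from user memory.
*/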
#ifdef _HAS_SOCKETCALL
static enum ppm_event_type parse_socketcall(struct event_filler_arguments *filler_args, struct pt_regs *regs)
{
unsigned long __user args[6] = {};
unsigned long __user *scargs;
int socketcall_id;
ppm_syscall_get_arguments(current, regs, args);
socketcall_id = args[0];
scargs = (unsigned long __user *)args[1];
if (unlikely(socketcall_id < SYS_SOCKET ||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)
socketcall_id > SYS_SENDMMSG))
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 33)
socketcall_id > SYS_RECVMMSG))
#else
socketcall_id > SYS_ACCEPT4))
#endif
return PPME_GENERIC_E;
#ifdef CONFIG_COMPAT
if (unlikely(filler_args->compat)) {
compat_ulong_t socketcall_args32[6];
int j;
if (unlikely(ppm_copy_from_user(socketcall_args32, compat_ptr(args[1]), compat_nas[socketcall_id])))
return PPME_GENERIC_E;
for (j = 0; j < 6; ++j)
filler_args->socketcall_args[j] = (unsigned long)socketcall_args32[j];
} else {
#endif
if (unlikely(ppm_copy_from_user(filler_args->socketcall_args, scargs, nas[socketcall_id])))
return PPME_GENERIC_E;
#ifdef CONFIG_COMPAT
}
#endif
switch (socketcall_id) {
case SYS_SOCKET:
return PPME_SOCKET_SOCKET_E;
case SYS_BIND:
return PPME_SOCKET_BIND_E;
case SYS_CONNECT:
return PPME_SOCKET_CONNECT_E;
case SYS_LISTEN:
return PPME_SOCKET_LISTEN_E;
case SYS_ACCEPT:
return PPME_SOCKET_ACCEPT_5_E;
case SYS_GETSOCKNAME:
return PPME_SOCKET_GETSOCKNAME_E;
case SYS_GETPEERNAME:
return PPME_SOCKET_GETPEERNAME_E;
case SYS_SOCKETPAIR:
return PPME_SOCKET_SOCKETPAIR_E;
case SYS_SEND:
return PPME_SOCKET_SEND_E;
case SYS_SENDTO:
return PPME_SOCKET_SENDTO_E;
case SYS_RECV:
return PPME_SOCKET_RECV_E;
case SYS_RECVFROM:
return PPME_SOCKET_RECVFROM_E;
case SYS_SHUTDOWN:
return PPME_SOCKET_SHUTDOWN_E;
case SYS_SETSOCKOPT:
return PPME_SOCKET_SETSOCKOPT_E;
case SYS_GETSOCKOPT:
return PPME_SOCKET_GETSOCKOPT_E;
case SYS_SENDMSG:
return PPME_SOCKET_SENDMSG_E;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)
case SYS_SENDMMSG:
return PPME_SOCKET_SENDMMSG_E;
#endif
case SYS_RECVMSG:
return PPME_SOCKET_RECVMSG_E;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 33)
case SYS_RECVMMSG:
return PPME_SOCKET_RECVMMSG_E;
#endif
case SYS_ACCEPT4:
return PPME_SOCKET_ACCEPT4_5_E;
default:
ASSERT(false);
return PPME_GENERIC_E;
}
}
#endif /* _HAS_SOCKETCALL */
static inline void record_drop_e(struct ppm_consumer_t *consumer,
nanoseconds ns,
enum syscall_flags drop_flags)
{
struct event_data_t event_data = {0};
if (record_event_consumer(consumer, PPME_DROP_E, UF_NEVER_DROP, ns, &event_data) == 0) {
consumer->need_to_insert_drop_e = 1;
} else {
if (consumer->need_to_insert_drop_e == 1 && !(drop_flags & UF_ATOMIC)) {
pr_err("drop enter event delayed insert\n");
}
consumer->need_to_insert_drop_e = 0;
}
}
static inline void record_drop_x(struct ppm_consumer_t *consumer,
nanoseconds ns,
enum syscall_flags drop_flags)
{
struct event_data_t event_data = {0};
if (record_event_consumer(consumer, PPME_DROP_X, UF_NEVER_DROP, ns, &event_data) == 0) {
consumer->need_to_insert_drop_x = 1;
} else {
if (consumer->need_to_insert_drop_x == 1 && !(drop_flags & UF_ATOMIC)) {
pr_err("drop exit event delayed insert\n");
}
consumer->need_to_insert_drop_x = 0;
}
}
// Return 1 if the event should be dropped, else 0
static inline int drop_nostate_event(enum ppm_event_type event_type,
struct pt_regs *regs)
{
unsigned long args[6] = {};
unsigned long arg = 0;
int close_fd = -1;
struct files_struct *files;
struct fdtable *fdt;
bool drop = false;
switch (event_type) {
case PPME_SYSCALL_CLOSE_X:
case PPME_SOCKET_BIND_X:
if (syscall_get_return_value(current, regs) < 0)
drop = true;
break;
case PPME_SYSCALL_CLOSE_E:
/*
* It's annoying but valid for a program to make a large number of
* close() calls on nonexistent fds. That can cause driver cpu usage
* to spike dramatically, so drop close events if the fd is not valid.
*
* The invalid fd events don't matter to userspace in dropping mode,
* so we do this before the UF_NEVER_DROP check
*/
ppm_syscall_get_arguments(current, regs, args);
arg = args[0];
close_fd = (int)arg;
files = current->files;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (close_fd < 0 || close_fd >= fdt->max_fds ||
#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 4, 0))
!FD_ISSET(close_fd, fdt->open_fds)
#else
!fd_is_open(close_fd, fdt)
#endif
) {
drop = true;
}
spin_unlock(&files->file_lock);
break;
case PPME_SYSCALL_FCNTL_E:
case PPME_SYSCALL_FCNTL_X:
// cmd arg
ppm_syscall_get_arguments(current, regs, args);
arg = args[1];
if (arg != F_DUPFD && arg != F_DUPFD_CLOEXEC)
drop = true;
break;
default:
break;
}
if (drop)
return 1;
else
return 0;
}
// Return 1 if the event should be dropped, else 0
static inline int drop_event(struct ppm_consumer_t *consumer,
enum ppm_event_type event_type,
enum syscall_flags drop_flags,
nanoseconds ns,
struct pt_regs *regs)
{
int maybe_ret = 0;
if (consumer->dropping_mode) {
maybe_ret = drop_nostate_event(event_type, regs);
if (maybe_ret > 0)
return maybe_ret;
}
if (drop_flags & UF_NEVER_DROP) {
ASSERT((drop_flags & UF_ALWAYS_DROP) == 0);
return 0;
}
if (consumer->dropping_mode) {
nanoseconds rem;
if (drop_flags & UF_ALWAYS_DROP) {
ASSERT((drop_flags & UF_NEVER_DROP) == 0);
return 1;
}
div64_u64_rem(ns, SECOND_IN_NS, &rem);
if (consumer->sampling_interval < SECOND_IN_NS &&
rem >= consumer->sampling_interval) {
if (consumer->is_dropping == 0) {
consumer->is_dropping = 1;
record_drop_e(consumer, ns, drop_flags);
}
return 1;
}
if (consumer->is_dropping == 1) {
consumer->is_dropping = 0;
record_drop_x(consumer, ns, drop_flags);
}
}
return 0;
}
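/*
* Sampling example: PPM_IOCTL_ENABLE_DROPPING_MODE with ratio 4 sets
* sampling_interval to 250000000 ns, so only events whose timestamp falls in
* the first 250 ms of each second are kept. A PPME_DROP_E marker is emitted
* when the dropping window starts and a PPME_DROP_X marker when it ends.
*/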
static void record_event_all_consumers(enum ppm_event_type event_type,
enum syscall_flags drop_flags,
struct event_data_t *event_datap)
{
struct ppm_consumer_t *consumer;
nanoseconds ns = ppm_nsecs();
rcu_read_lock();
list_for_each_entry_rcu(consumer, &g_consumer_list, node) {
record_event_consumer(consumer, event_type, drop_flags, ns, event_datap);
}
rcu_read_unlock();
}
/*
* Returns 0 if the event is dropped
*/
static int record_event_consumer(struct ppm_consumer_t *consumer,
enum ppm_event_type event_type,
enum syscall_flags drop_flags,
nanoseconds ns,
struct event_data_t *event_datap)
{
int res = 0;
size_t event_size = 0;
int next;
u32 freespace;
u32 usedspace;
u32 delta_from_end;
struct event_filler_arguments args;
u32 ttail;
u32 head;
struct ppm_ring_buffer_context *ring;
struct ppm_ring_buffer_info *ring_info;
int drop = 1;
int32_t cbres = PPM_SUCCESS;
int cpu;
if (!test_bit(event_type, g_events_mask))
return res;
if (event_type != PPME_DROP_E && event_type != PPME_DROP_X) {
if (consumer->need_to_insert_drop_e == 1)
record_drop_e(consumer, ns, drop_flags);
else if (consumer->need_to_insert_drop_x == 1)
record_drop_x(consumer, ns, drop_flags);
if (drop_event(consumer,
event_type,
drop_flags,
ns,
event_datap->event_info.syscall_data.regs))
return res;
}
/*
* FROM THIS MOMENT ON, WE HAVE TO BE SUPER FAST
*/
cpu = get_cpu();
ring = per_cpu_ptr(consumer->ring_buffers, cpu);
ASSERT(ring);
ring_info = ring->info;
if (!ring->capture_enabled) {
put_cpu();
return res;
}
ring_info->n_evts++;
if (event_datap->category == PPMC_CONTEXT_SWITCH && event_datap->event_info.context_data.sched_prev != NULL) {
if (event_type != PPME_SYSDIGEVENT_E && event_type != PPME_CPU_HOTPLUG_E) {
ASSERT(event_datap->event_info.context_data.sched_prev != NULL);
ASSERT(event_datap->event_info.context_data.sched_next != NULL);
ring_info->n_context_switches++;
}
}
/*
* Preemption gate
*/
if (unlikely(atomic_inc_return(&ring->preempt_count) != 1)) {
/* When a filler in this driver calls ppm_copy_from_user(), the page fault
* tracepoint can fire very early in the page fault handler, even if page
* faults are disabled, way before the kernel terminates it, so hitting this
* path is legitimate. We are still not sure how to solve this, so for the
* moment we handle this case by not complaining and ignoring the false
* alarm when the preemption exception is generated by page_fault_kernel.
* The alternative would be to disable the kernel tracepoint completely,
* but there is value in seeing page faults generated on this side, so
* let's see if someone complains.
* This means that those events are effectively lost.
*/
if (event_type != PPME_PAGE_FAULT_E) {
ring_info->n_preemptions++;
ASSERT(false);
}
atomic_dec(&ring->preempt_count);
put_cpu();
return res;
}
/*
* Calculate the space currently available in the buffer
*/
head = ring_info->head;
ttail = ring_info->tail;
if (ttail > head)
freespace = ttail - head - 1;
else
freespace = RING_BUF_SIZE + ttail - head - 1;
usedspace = RING_BUF_SIZE - freespace - 1;
delta_from_end = RING_BUF_SIZE + (2 * PAGE_SIZE) - head - 1;
ASSERT(freespace <= RING_BUF_SIZE);
ASSERT(usedspace <= RING_BUF_SIZE);
ASSERT(ttail <= RING_BUF_SIZE);
ASSERT(head <= RING_BUF_SIZE);
ASSERT(delta_from_end < RING_BUF_SIZE + (2 * PAGE_SIZE));
ASSERT(delta_from_end > (2 * PAGE_SIZE) - 1);
#ifdef _HAS_SOCKETCALL
/*
* If this is a socketcall system call, determine the correct event type
* by parsing the arguments and patch event_type accordingly
* A bit of explanation: most linux architectures don't have a separate
* syscall for each of the socket functions (bind, connect...). Instead,
* the socket functions are aggregated into a single syscall, called
* socketcall. The first socketcall argument is the call type, while the
* second argument contains a pointer to the arguments of the original
* call. I guess this was done to reduce the number of syscalls...
*/
if (event_datap->category == PPMC_SYSCALL && event_datap->event_info.syscall_data.regs && event_datap->event_info.syscall_data.id == event_datap->socketcall_syscall) {
enum ppm_event_type tet;
args.is_socketcall = true;
args.compat = event_datap->compat;
tet = parse_socketcall(&args, event_datap->event_info.syscall_data.regs);
if (event_type == PPME_GENERIC_E)
event_type = tet;
else
event_type = tet + 1;
} else {
args.is_socketcall = false;
args.compat = false;
}
args.socketcall_syscall = event_datap->socketcall_syscall;
#endif
ASSERT(event_type < PPM_EVENT_MAX);
/*
* Determine how many arguments this event has
*/
args.nargs = g_event_info[event_type].nparams;
args.arg_data_offset = args.nargs * sizeof(u16);
/*
* Make sure we have enough space for the event header.
* We need at least space for the header plus 16 bit per parameter for the lengths.
*/
if (likely(freespace >= sizeof(struct ppm_evt_hdr) + args.arg_data_offset)) {
/*
* Populate the header
*/
struct ppm_evt_hdr *hdr = (struct ppm_evt_hdr *)(ring->buffer + head);
#ifdef PPM_ENABLE_SENTINEL
hdr->sentinel_begin = ring->nevents;
#endif
hdr->ts = ns;
hdr->tid = current->pid;
hdr->type = event_type;
hdr->nparams = args.nargs;
/*
* Populate the parameters for the filler callback
*/
args.consumer = consumer;
args.buffer = ring->buffer + head + sizeof(struct ppm_evt_hdr);
#ifdef PPM_ENABLE_SENTINEL
args.sentinel = ring->nevents;
#endif
args.buffer_size = min(freespace, delta_from_end) - sizeof(struct ppm_evt_hdr); /* freespace is guaranteed to be bigger than sizeof(struct ppm_evt_hdr) */
args.event_type = event_type;
if (event_datap->category == PPMC_SYSCALL) {
args.regs = event_datap->event_info.syscall_data.regs;
args.syscall_id = event_datap->event_info.syscall_data.id;
args.cur_g_syscall_code_routing_table = event_datap->event_info.syscall_data.cur_g_syscall_code_routing_table;
args.compat = event_datap->compat;
} else {
args.regs = NULL;
args.syscall_id = -1;
args.cur_g_syscall_code_routing_table = NULL;
args.compat = false;
}
if (event_datap->category == PPMC_CONTEXT_SWITCH) {
args.sched_prev = event_datap->event_info.context_data.sched_prev;
args.sched_next = event_datap->event_info.context_data.sched_next;
} else {
args.sched_prev = NULL;
args.sched_next = NULL;
}
if (event_datap->category == PPMC_SIGNAL) {
args.signo = event_datap->event_info.signal_data.sig;
if (event_datap->event_info.signal_data.info == NULL) {
args.spid = (__kernel_pid_t) 0;
} else if (args.signo == SIGKILL) {
args.spid = event_datap->event_info.signal_data.info->_sifields._kill._pid;
} else if (args.signo == SIGTERM || args.signo == SIGHUP || args.signo == SIGINT ||
args.signo == SIGTSTP || args.signo == SIGQUIT) {
if (event_datap->event_info.signal_data.info->si_code == SI_USER ||
event_datap->event_info.signal_data.info->si_code == SI_QUEUE ||
event_datap->event_info.signal_data.info->si_code <= 0) {
args.spid = event_datap->event_info.signal_data.info->si_pid;
}
} else if (args.signo == SIGCHLD) {
args.spid = event_datap->event_info.signal_data.info->_sifields._sigchld._pid;
} else if (args.signo >= SIGRTMIN && args.signo <= SIGRTMAX) {
args.spid = event_datap->event_info.signal_data.info->_sifields._rt._pid;
} else {
args.spid = (__kernel_pid_t) 0;
}
} else {
args.signo = 0;
args.spid = (__kernel_pid_t) 0;
}
args.dpid = current->pid;
if (event_datap->category == PPMC_PAGE_FAULT)
args.fault_data = event_datap->event_info.fault_data;
args.curarg = 0;
args.arg_data_size = args.buffer_size - args.arg_data_offset;
args.nevents = ring->nevents;
args.str_storage = ring->str_storage;
args.enforce_snaplen = false;
/*
* Fire the filler callback
*/
if (likely(g_ppm_events[event_type].filler_callback)) {
cbres = g_ppm_events[event_type].filler_callback(&args);
} else {
pr_err("corrupted filler for event type %d: NULL callback\n", event_type);
ASSERT(0);
}
if (likely(cbres == PPM_SUCCESS)) {
/*
* Validate that the filler added the right number of parameters
*/
if (likely(args.curarg == args.nargs)) {
/*
* The event was successfully inserted in the buffer
*/
event_size = sizeof(struct ppm_evt_hdr) + args.arg_data_offset;
hdr->len = event_size;
drop = 0;
} else {
pr_err("corrupted filler for event type %d (added %u args, should have added %u)\n",
event_type,
args.curarg,
args.nargs);
ASSERT(0);
}
}
}
if (likely(!drop)) {
res = 1;
next = head + event_size;
if (unlikely(next >= RING_BUF_SIZE)) {
/*
* If something has been written in the cushion space at the end of
* the buffer, copy it to the beginning and wrap the head around.
* Note, we don't check that the copy fits because we assume that
* filler_callback failed if the space was not enough.
*/
if (next > RING_BUF_SIZE) {
memcpy(ring->buffer,
ring->buffer + RING_BUF_SIZE,
next - RING_BUF_SIZE);
}
next -= RING_BUF_SIZE;
}
/*
* Make sure all the memory has been written in real memory before
* we update the head and the user space process (on another CPU)
* can access the buffer.
*/
smp_wmb();
ring_info->head = next;
++ring->nevents;
} else {
if (cbres == PPM_SUCCESS) {
ASSERT(freespace < sizeof(struct ppm_evt_hdr) + args.arg_data_offset);
ring_info->n_drops_buffer++;
} else if (cbres == PPM_FAILURE_INVALID_USER_MEMORY) {
#ifdef _DEBUG
pr_err("Invalid read from user for event %d\n", event_type);
#endif
ring_info->n_drops_pf++;
} else if (cbres == PPM_FAILURE_BUFFER_FULL) {
ring_info->n_drops_buffer++;
} else {
ASSERT(false);
}
}
if (MORE_THAN_ONE_SECOND_AHEAD(ns, ring->last_print_time + 1) && !(drop_flags & UF_ATOMIC)) {
vpr_info("consumer:%p CPU:%d, use:%d%%, ev:%llu, dr_buf:%llu, dr_pf:%llu, pr:%llu, cs:%llu\n",
consumer->consumer_id,
smp_processor_id(),
(usedspace * 100) / RING_BUF_SIZE,
ring_info->n_evts,
ring_info->n_drops_buffer,
ring_info->n_drops_pf,
ring_info->n_preemptions,
ring->info->n_context_switches);
ring->last_print_time = ns;
}
atomic_dec(&ring->preempt_count);
put_cpu();
return res;
}
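/*
* Ring buffer recap: the driver is the producer and advances ring_info->head
* after a write barrier; userspace is the consumer and advances ring_info->tail
* through the writable info mapping. Fillers may spill into the two cushion
* pages allocated past RING_BUF_SIZE; the spilled bytes are copied back to the
* start of the buffer before head wraps, so consumers never see a torn event.
*/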
static inline void g_n_tracepoint_hit_inc(void)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 34)
this_cpu_inc(g_n_tracepoint_hit);
#elif defined(this_cpu_inc)
/* this_cpu_inc has been added with 2.6.33 but backported by RHEL/CentOS to 2.6.32
* so just checking the existence of the symbol rather than matching the kernel version
* https://github.com/torvalds/linux/commit/7340a0b15280c9d902c7dd0608b8e751b5a7c403
*
* per_cpu_var removed with:
* https://github.com/torvalds/linux/commit/dd17c8f72993f9461e9c19250e3f155d6d99df22
*/
this_cpu_inc(per_cpu_var(g_n_tracepoint_hit));
#endif
}
TRACEPOINT_PROBE(syscall_enter_probe, struct pt_regs *regs, long id)
{
long table_index;
const struct syscall_evt_pair *cur_g_syscall_table = g_syscall_table;
const enum ppm_syscall_code *cur_g_syscall_code_routing_table = g_syscall_code_routing_table;
bool compat = false;
#ifdef __NR_socketcall
int socketcall_syscall = __NR_socketcall;
#else
int socketcall_syscall = -1;
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_IA32_EMULATION)
/*
* If this is a 32bit process running on a 64bit kernel (see the CONFIG_IA32_EMULATION
* kernel flag), we switch to the ia32 syscall table.
*/
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
if (in_ia32_syscall()) {
#else
if (unlikely(task_thread_info(current)->status & TS_COMPAT)) {
#endif
cur_g_syscall_table = g_syscall_ia32_table;
cur_g_syscall_code_routing_table = g_syscall_ia32_code_routing_table;
socketcall_syscall = __NR_ia32_socketcall;
compat = true;
}
#endif
g_n_tracepoint_hit_inc();
table_index = id - SYSCALL_TABLE_ID0;
if (likely(table_index >= 0 && table_index < SYSCALL_TABLE_SIZE)) {
struct event_data_t event_data;
int used = cur_g_syscall_table[table_index].flags & UF_USED;
enum syscall_flags drop_flags = cur_g_syscall_table[table_index].flags;
enum ppm_event_type type;
/*
* Simple mode event filtering
*/
if (g_simple_mode_enabled) {
if ((drop_flags & UF_SIMPLEDRIVER_KEEP) == 0) {
return;
}
}
#ifdef _HAS_SOCKETCALL
if (id == socketcall_syscall) {
used = true;
drop_flags = UF_NEVER_DROP;
type = PPME_GENERIC_E;
} else
type = cur_g_syscall_table[table_index].enter_event_type;
#else
type = cur_g_syscall_table[table_index].enter_event_type;
#endif
event_data.category = PPMC_SYSCALL;
event_data.event_info.syscall_data.regs = regs;
event_data.event_info.syscall_data.id = id;
event_data.event_info.syscall_data.cur_g_syscall_code_routing_table = cur_g_syscall_code_routing_table;
event_data.socketcall_syscall = socketcall_syscall;
event_data.compat = compat;
if (used)
record_event_all_consumers(type, drop_flags, &event_data);
else
record_event_all_consumers(PPME_GENERIC_E, UF_ALWAYS_DROP, &event_data);
}
}
TRACEPOINT_PROBE(syscall_exit_probe, struct pt_regs *regs, long ret)
{
int id;
long table_index;
const struct syscall_evt_pair *cur_g_syscall_table = g_syscall_table;
const enum ppm_syscall_code *cur_g_syscall_code_routing_table = g_syscall_code_routing_table;
bool compat = false;
#ifdef __NR_socketcall
int socketcall_syscall = __NR_socketcall;
#else
int socketcall_syscall = -1;
#endif
id = syscall_get_nr(current, regs);
#if defined(CONFIG_X86_64) && defined(CONFIG_IA32_EMULATION)
/*
* When a process does an execve from 64bit to 32bit, TS_COMPAT is marked true
* but the id of the syscall is __NR_execve, so to parse it correctly we need to
* use the 64bit syscall table. On 32bit, __NR_execve is equal to __NR_ia32_oldolduname,
* a very old syscall that is no longer used by most applications
*/
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
if (in_ia32_syscall() && id != __NR_execve) {
#else
if (unlikely((task_thread_info(current)->status & TS_COMPAT) && id != __NR_execve)) {
#endif
cur_g_syscall_table = g_syscall_ia32_table;
cur_g_syscall_code_routing_table = g_syscall_ia32_code_routing_table;
socketcall_syscall = __NR_ia32_socketcall;
compat = true;
}
#endif
g_n_tracepoint_hit_inc();
table_index = id - SYSCALL_TABLE_ID0;
if (likely(table_index >= 0 && table_index < SYSCALL_TABLE_SIZE)) {
struct event_data_t event_data;
int used = cur_g_syscall_table[table_index].flags & UF_USED;
enum syscall_flags drop_flags = cur_g_syscall_table[table_index].flags;
enum ppm_event_type type;
/*
* Simple mode event filtering
*/
if (g_simple_mode_enabled) {
if ((drop_flags & UF_SIMPLEDRIVER_KEEP) == 0) {
return;
}
}
#ifdef _HAS_SOCKETCALL
if (id == socketcall_syscall) {
used = true;
drop_flags = UF_NEVER_DROP;
type = PPME_GENERIC_X;
} else
type = cur_g_syscall_table[table_index].exit_event_type;
#else
type = cur_g_syscall_table[table_index].exit_event_type;
#endif
event_data.category = PPMC_SYSCALL;
event_data.event_info.syscall_data.regs = regs;
event_data.event_info.syscall_data.id = id;
event_data.event_info.syscall_data.cur_g_syscall_code_routing_table = cur_g_syscall_code_routing_table;
event_data.socketcall_syscall = socketcall_syscall;
event_data.compat = compat;
if (used)
record_event_all_consumers(type, drop_flags, &event_data);
else
record_event_all_consumers(PPME_GENERIC_X, UF_ALWAYS_DROP, &event_data);
}
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 1)
int __access_remote_vm(struct task_struct *t, struct mm_struct *mm, unsigned long addr,
void *buf, int len, int write);
#endif
TRACEPOINT_PROBE(syscall_procexit_probe, struct task_struct *p)
{
struct event_data_t event_data;
g_n_tracepoint_hit_inc();
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
if (unlikely(current->flags & PF_KTHREAD)) {
#else
if (unlikely(current->flags & PF_BORROWED_MM)) {
#endif
/*
* We are not interested in kernel threads
*/
return;
}
event_data.category = PPMC_CONTEXT_SWITCH;
event_data.event_info.context_data.sched_prev = p;
event_data.event_info.context_data.sched_next = p;
record_event_all_consumers(PPME_PROCEXIT_1_E, UF_NEVER_DROP, &event_data);
}
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#ifdef CAPTURE_CONTEXT_SWITCHES
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 35))
TRACEPOINT_PROBE(sched_switch_probe, struct rq *rq, struct task_struct *prev, struct task_struct *next)
#elif (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0))
TRACEPOINT_PROBE(sched_switch_probe, struct task_struct *prev, struct task_struct *next)
#else
TRACEPOINT_PROBE(sched_switch_probe, bool preempt, struct task_struct *prev, struct task_struct *next)
#endif
{
struct event_data_t event_data;
g_n_tracepoint_hit_inc();
event_data.category = PPMC_CONTEXT_SWITCH;
event_data.event_info.context_data.sched_prev = prev;
event_data.event_info.context_data.sched_next = next;
/*
* Need to indicate ATOMIC (i.e. interrupt) context to avoid the event
* handler calling printk() and potentially deadlocking the system.
*/
record_event_all_consumers(PPME_SCHEDSWITCH_6_E, UF_USED | UF_ATOMIC, &event_data);
}
#endif
#ifdef CAPTURE_SIGNAL_DELIVERIES
static __always_inline int siginfo_not_a_pointer(struct siginfo* info)
{
#ifdef SEND_SIG_FORCED
return info == SEND_SIG_NOINFO || info == SEND_SIG_PRIV || info == SEND_SIG_FORCED;
#else
return info == (struct siginfo*)SEND_SIG_NOINFO || info == (struct siginfo*)SEND_SIG_PRIV;
#endif
}
TRACEPOINT_PROBE(signal_deliver_probe, int sig, struct siginfo *info, struct k_sigaction *ka)
{
struct event_data_t event_data;
g_n_tracepoint_hit_inc();
event_data.category = PPMC_SIGNAL;
event_data.event_info.signal_data.sig = sig;
if (siginfo_not_a_pointer(info))
event_data.event_info.signal_data.info = NULL;
else
event_data.event_info.signal_data.info = info;
event_data.event_info.signal_data.ka = ka;
record_event_all_consumers(PPME_SIGNALDELIVER_E, UF_USED | UF_ALWAYS_DROP, &event_data);
}
#endif
#ifdef CAPTURE_PAGE_FAULTS
TRACEPOINT_PROBE(page_fault_probe, unsigned long address, struct pt_regs *regs, unsigned long error_code)
{
struct event_data_t event_data;
/* We register both tracepoints under the same probe and
* sysdig event since there's little reason to expose this
* complexity to the sysdig user. The distinction can still be made
* in the output by looking for the USER_FAULT/SUPERVISOR_FAULT
* flags
*/
g_n_tracepoint_hit_inc();
/* I still haven't decided if I'm interested in kernel threads or not.
* For the moment, I assume yes since I can see some value for it.
*/
event_data.category = PPMC_PAGE_FAULT;
event_data.event_info.fault_data.address = address;
event_data.event_info.fault_data.regs = regs;
event_data.event_info.fault_data.error_code = error_code;
record_event_all_consumers(PPME_PAGE_FAULT_E, UF_ALWAYS_DROP, &event_data);
}
#endif
static int init_ring_buffer(struct ppm_ring_buffer_context *ring)
{
unsigned int j;
/*
* Allocate the string storage in the ring descriptor
*/
ring->str_storage = (char *)__get_free_page(GFP_USER);
if (!ring->str_storage) {
pr_err("Error allocating the string storage\n");
goto init_ring_err;
}
/*
* Allocate the buffer.
* Note how we allocate 2 additional pages: they are used as additional overflow space for
* the event data generation functions, so that they always operate on a contiguous buffer.
*/
ring->buffer = vmalloc(RING_BUF_SIZE + 2 * PAGE_SIZE);
if (ring->buffer == NULL) {
pr_err("Error allocating ring memory\n");
goto init_ring_err;
}
for (j = 0; j < RING_BUF_SIZE + 2 * PAGE_SIZE; j++)
ring->buffer[j] = 0;
/*
* Allocate the buffer info structure
*/
ring->info = vmalloc(sizeof(struct ppm_ring_buffer_info));
if (ring->info == NULL) {
pr_err("Error allocating ring memory\n");
goto init_ring_err;
}
/*
* Initialize the buffer info structure
*/
reset_ring_buffer(ring);
atomic_set(&ring->preempt_count, 0);
pr_info("CPU buffer initialized, size=%d\n", RING_BUF_SIZE);
return 1;
init_ring_err:
free_ring_buffer(ring);
return 0;
}
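/*
* free_ring_buffer releases whatever init_ring_buffer managed to allocate.
* Every pointer is checked and reset to NULL, so it is safe to call on a
* partially initialized ring.
*/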
static void free_ring_buffer(struct ppm_ring_buffer_context *ring)
{
if (ring->info) {
vfree(ring->info);
ring->info = NULL;
}
if (ring->buffer) {
vfree((void *)ring->buffer);
ring->buffer = NULL;
}
if (ring->str_storage) {
free_page((unsigned long)ring->str_storage);
ring->str_storage = NULL;
}
}
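/*
* reset_ring_buffer returns a ring to its initial state: head, tail and all
* statistics counters are zeroed and the open/capture flags are cleared
* (preempt_count is deliberately left untouched, see the note below).
*/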
static void reset_ring_buffer(struct ppm_ring_buffer_context *ring)
{
/*
* ring->preempt_count is not reset to 0 on purpose, to prevent a race condition
* see ppm_open
*/
ring->open = false;
ring->capture_enabled = false;
ring->info->head = 0;
ring->info->tail = 0;
ring->nevents = 0;
ring->info->n_evts = 0;
ring->info->n_drops_buffer = 0;
ring->info->n_drops_pf = 0;
ring->info->n_preemptions = 0;
ring->info->n_context_switches = 0;
ring->last_print_time = ppm_nsecs();
}
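/*
* Starting with kernel 3.15, tracepoint_probe_register() takes a
* struct tracepoint pointer instead of a name, so at load time we walk the
* kernel's tracepoint table once with for_each_kernel_tracepoint() and cache
* the handles we need. Registration elsewhere in this file then looks
* roughly like the following sketch (illustrative only, error handling
* omitted):
*
*   tracepoint_probe_register(tp_sys_enter, (void *)syscall_enter_probe, NULL);
*
* On older kernels the name-based registration is used and no handles are
* needed, hence the empty get_tracepoint_handles() fallback below.
*/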
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0))
static void visit_tracepoint(struct tracepoint *tp, void *priv)
{
if (!strcmp(tp->name, "sys_enter"))
tp_sys_enter = tp;
else if (!strcmp(tp->name, "sys_exit"))
tp_sys_exit = tp;
else if (!strcmp(tp->name, "sched_process_exit"))
tp_sched_process_exit = tp;
#ifdef CAPTURE_CONTEXT_SWITCHES
else if (!strcmp(tp->name, "sched_switch"))
tp_sched_switch = tp;
#endif
#ifdef CAPTURE_SIGNAL_DELIVERIES
else if (!strcmp(tp->name, "signal_deliver"))
tp_signal_deliver = tp;
#endif
#ifdef CAPTURE_PAGE_FAULTS
else if (!strcmp(tp->name, "page_fault_user"))
tp_page_fault_user = tp;
else if (!strcmp(tp->name, "page_fault_kernel"))
tp_page_fault_kernel = tp;
#endif
}
static int get_tracepoint_handles(void)
{
for_each_kernel_tracepoint(visit_tracepoint, NULL);
if (!tp_sys_enter) {
pr_err("failed to find sys_enter tracepoint\n");
return -ENOENT;
}
if (!tp_sys_exit) {
pr_err("failed to find sys_exit tracepoint\n");
return -ENOENT;
}
if (!tp_sched_process_exit) {
pr_err("failed to find sched_process_exit tracepoint\n");
return -ENOENT;
}
#ifdef CAPTURE_CONTEXT_SWITCHES
if (!tp_sched_switch) {
pr_err("failed to find sched_switch tracepoint\n");
return -ENOENT;
}
#endif
#ifdef CAPTURE_SIGNAL_DELIVERIES
if (!tp_signal_deliver) {
pr_err("failed to find signal_deliver tracepoint\n");
return -ENOENT;
}
#endif
#ifdef CAPTURE_PAGE_FAULTS
if (!tp_page_fault_user) {
pr_notice("failed to find page_fault_user tracepoint, disabling page-faults\n");
g_fault_tracepoint_disabled = true;
}
if (!tp_page_fault_kernel) {
pr_notice("failed to find page_fault_kernel tracepoint, disabling page-faults\n");
g_fault_tracepoint_disabled = true;
}
#endif
return 0;
}
#else
static int get_tracepoint_handles(void)
{
return 0;
}
#endif
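/*
* ppm_devnode controls the permissions of the device nodes created in
* sysdig_init: the per-CPU capture devices are root read-only (0400), while
* the extra minor equal to g_ppm_numdevs (reserved for sysdig-events, see
* the "+ 1" in the chrdev allocation) is made world write-only (0222) so
* that unprivileged writers can use it.
*/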
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 2, 0)
static char *ppm_devnode(const struct device *dev, umode_t *mode)
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0)
static char *ppm_devnode(struct device *dev, umode_t *mode)
#else
static char *ppm_devnode(struct device *dev, mode_t *mode)
#endif
{
if (mode) {
*mode = 0400;
if (dev)
if (MINOR(dev->devt) == g_ppm_numdevs)
*mode = 0222;
}
return NULL;
}
#endif /* LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20) */
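/*
* do_cpu_callback is the common handler for both hotplug mechanisms below.
* sd_action is 1 when a CPU is coming online and 2 when it is going offline;
* the per-consumer ring buffer for that CPU is only flagged here (allocation
* happens later in ppm_open), and a PPME_CPU_HOTPLUG_E event is emitted with
* the cpu number and action packed into the context_data fields.
*/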
static int do_cpu_callback(unsigned long cpu, long sd_action)
{
struct ppm_ring_buffer_context *ring;
struct ppm_consumer_t *consumer;
struct event_data_t event_data;
if (sd_action != 0) {
rcu_read_lock();
list_for_each_entry_rcu(consumer, &g_consumer_list, node) {
ring = per_cpu_ptr(consumer->ring_buffers, cpu);
if (sd_action == 1) {
/*
* If the cpu was offline when the consumer was created,
* this won't do anything because we never created a ring
* buffer. We can't safely create one here because we're
* in atomic context, and the consumer needs to call open
* on this device anyway, so the allocation is done in ppm_open.
*/
ring->cpu_online = true;
} else if (sd_action == 2) {
ring->cpu_online = false;
}
}
rcu_read_unlock();
event_data.category = PPMC_CONTEXT_SWITCH;
event_data.event_info.context_data.sched_prev = (void *)cpu;
event_data.event_info.context_data.sched_next = (void *)sd_action;
record_event_all_consumers(PPME_CPU_HOTPLUG_E, UF_NEVER_DROP, &event_data);
}
return 0;
}
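/*
* On kernels >= 4.10 the dynamic CPU hotplug state machine is used:
* cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, ...) returns the allocated
* state number (> 0) on success, which is saved in hp_state so that
* sysdig_exit() can hand it back to cpuhp_remove_state_nocalls(). Older
* kernels fall back to the register_cpu_notifier() interface.
*/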
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
static int sysdig_cpu_online(unsigned int cpu)
{
vpr_info("sysdig_cpu_online on cpu %d\n", cpu);
return do_cpu_callback(cpu, 1);
}
static int sysdig_cpu_offline(unsigned int cpu)
{
vpr_info("sysdig_cpu_offline on cpu %d\n", cpu);
return do_cpu_callback(cpu, 2);
}
#else /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0)) */
/*
* This gets called every time a CPU is added or removed
*/
static int cpu_callback(struct notifier_block *self, unsigned long action,
void *hcpu)
{
unsigned long cpu = (unsigned long)hcpu;
long sd_action = 0;
switch (action) {
case CPU_UP_PREPARE:
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
case CPU_UP_PREPARE_FROZEN:
#endif
sd_action = 1;
break;
case CPU_DOWN_PREPARE:
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
case CPU_DOWN_PREPARE_FROZEN:
#endif
sd_action = 2;
break;
default:
break;
}
if (do_cpu_callback(cpu, sd_action) < 0)
return NOTIFY_BAD;
else
return NOTIFY_OK;
}
static struct notifier_block cpu_notifier = {
.notifier_call = &cpu_callback,
.next = NULL,
};
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) */
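/*
* sysdig_init: module entry point. It resolves the tracepoint handles,
* allocates a chrdev region with one minor per possible CPU plus one for
* sysdig-events, creates the device class and per-CPU device nodes,
* initializes the lookahead-based snaplen logic and registers the CPU
* hotplug callback. The tracepoints themselves are attached later, when a
* consumer starts a capture; g_tracepoint_registered is only initialized to
* false here. A consumer is expected to open one per-CPU device and mmap
* the ring buffer through it, roughly (user-space sketch, assuming the
* default "sysdig" device name; the exact mapping protocol is defined by
* ppm_open/ppm_mmap):
*
*   int fd = open("/dev/sysdig0", O_RDWR);   /* then mmap the ring via fd */
*/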
int sysdig_init(void)
{
dev_t dev;
unsigned int cpu;
unsigned int num_cpus;
int ret;
int acrret = 0;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
int hp_ret;
#endif
int j;
int n_created_devices = 0;
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
struct device *device = NULL;
#else
struct class_device *device = NULL;
#endif
pr_info("driver loading, " PROBE_NAME " " PROBE_VERSION "\n");
ret = get_tracepoint_handles();
if (ret < 0)
goto init_module_err;
num_cpus = 0;
for_each_possible_cpu(cpu) {
++num_cpus;
}
/*
* Initialize the user I/O
* ( + 1 for sysdig-events)
*/
acrret = alloc_chrdev_region(&dev, 0, num_cpus + 1, PROBE_DEVICE_NAME);
if (acrret < 0) {
pr_err("could not allocate major number for %s\n", PROBE_DEVICE_NAME);
ret = -ENOMEM;
goto init_module_err;
}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0)
g_ppm_class = class_create(PROBE_DEVICE_NAME);
#else
g_ppm_class = class_create(THIS_MODULE, PROBE_DEVICE_NAME);
#endif
if (IS_ERR(g_ppm_class)) {
pr_err("can't allocate device class\n");
ret = -EFAULT;
goto init_module_err;
}
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
g_ppm_class->devnode = ppm_devnode;
#endif
g_ppm_major = MAJOR(dev);
g_ppm_numdevs = num_cpus;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 4, 0)
g_ppm_devs = kmalloc(g_ppm_numdevs * sizeof(struct ppm_device), GFP_KERNEL);
#else
g_ppm_devs = kmalloc_array(g_ppm_numdevs, sizeof(struct ppm_device), GFP_KERNEL);
#endif
if (!g_ppm_devs) {
pr_err("can't allocate devices\n");
ret = -ENOMEM;
goto init_module_err;
}
/*
* We create a unique user level device for each of the ring buffers
*/
for (j = 0; j < g_ppm_numdevs; ++j) {
cdev_init(&g_ppm_devs[j].cdev, &g_ppm_fops);
g_ppm_devs[j].dev = MKDEV(g_ppm_major, j);
if (cdev_add(&g_ppm_devs[j].cdev, g_ppm_devs[j].dev, 1) < 0) {
pr_err("could not allocate chrdev for %s\n", PROBE_DEVICE_NAME);
ret = -EFAULT;
goto init_module_err;
}
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
device = device_create(
#else
device = class_device_create(
#endif
g_ppm_class, NULL, /* no parent device */
g_ppm_devs[j].dev,
NULL, /* no additional data */
PROBE_DEVICE_NAME "%d",
j);
if (IS_ERR(device)) {
pr_err("error creating the device for %s\n", PROBE_DEVICE_NAME);
cdev_del(&g_ppm_devs[j].cdev);
ret = -EFAULT;
goto init_module_err;
}
init_waitqueue_head(&g_ppm_devs[j].read_queue);
n_created_devices++;
}
/* create_proc_read_entry(PPM_DEVICE_NAME, 0, NULL, ppm_read_proc, NULL); */
/*
* Snaplen lookahead initialization
*/
if (dpi_lookahead_init() != PPM_SUCCESS) {
pr_err("initializing lookahead-based snaplen failed\n");
ret = -EFAULT;
goto init_module_err;
}
/*
* Set up our callback in case we get a hotplug event while we are
* initializing the cpu structures
*/
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
hp_ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
"sysdig/probe:online",
sysdig_cpu_online,
sysdig_cpu_offline);
if (hp_ret <= 0) {
pr_err("error registering cpu hotplug callback\n");
ret = hp_ret;
goto init_module_err;
}
hp_state = hp_ret;
#else
register_cpu_notifier(&cpu_notifier);
#endif
/*
* All ok. Final initializations.
*/
g_tracepoint_registered = false;
return 0;
init_module_err:
for (j = 0; j < n_created_devices; ++j) {
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
device_destroy(
#else
class_device_destroy(
#endif
g_ppm_class, g_ppm_devs[j].dev);
cdev_del(&g_ppm_devs[j].cdev);
}
if (g_ppm_class)
class_destroy(g_ppm_class);
/* + 1 for sysdig-events, matching the chrdev region allocated above */
if (acrret == 0)
unregister_chrdev_region(dev, g_ppm_numdevs + 1);
kfree(g_ppm_devs);
return ret;
}
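/*
* sysdig_exit undoes sysdig_init: it destroys the device nodes and class,
* releases the chrdev region (g_ppm_numdevs + 1 minors, the extra one being
* sysdig-events), waits for any in-flight probes with
* tracepoint_synchronize_unregister() and removes the CPU hotplug callback.
*/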
void sysdig_exit(void)
{
int j;
pr_info("driver unloading\n");
for (j = 0; j < g_ppm_numdevs; ++j) {
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
device_destroy(
#else
class_device_destroy(
#endif
g_ppm_class, g_ppm_devs[j].dev);
cdev_del(&g_ppm_devs[j].cdev);
}
if (g_ppm_class)
class_destroy(g_ppm_class);
/* + 1 for sysdig-events */
unregister_chrdev_region(MKDEV(g_ppm_major, 0), g_ppm_numdevs + 1);
kfree(g_ppm_devs);
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
tracepoint_synchronize_unregister();
#endif
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0))
if (hp_state > 0)
cpuhp_remove_state_nocalls(hp_state);
#else
unregister_cpu_notifier(&cpu_notifier);
#endif
}
module_init(sysdig_init);
module_exit(sysdig_exit);
module_param(max_consumers, uint, 0444);
MODULE_PARM_DESC(max_consumers, "Maximum number of consumers that can simultaneously open the devices");
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 20)
module_param(verbose, bool, 0444);
MODULE_PARM_DESC(verbose, "Enable verbose logging");
#endif