红联Linux门户
Linux帮助

Linux 2.6.18.8内核中netfilter分析

发布时间:2015-02-12 22:48:53来源:linux网站作者:cyliu

1,简单介绍

在2.6.16内核的netfilter中,netfilter一个重大修正思想就是将netfilter作为一个协议无关的框架,表现在内核结构树中单独建立net/netfilter目录,而在以前netfilter是附着在各个协议目录之下的,如在net/ipv4, net/ipv6等目录下。现在虽然各协议目录下也都有,但主要是处理和各协议相关的东西了,而一些共同的东西,就都放在net/netfilter目录下,文件名也有所改变,虽然现在还不是很独立,比如说net/netfilter/nf_conntrack_core.c和net/ipv4/netfilter/ip_conntrack_core.c就仍然很相似,让人觉得没必要那么分,但不少和协议无关的匹配和目标模块已经和协议分离,只在此目录下有,而不放在协议目录下了。

在net/netfilter下的匹配和目标模块文件名称都以“xt_”打头,如 xt_comment.c,xt_policy.c等

目标模块有:

xt_CLASSIFY.c
xt_NFQUEUE.c
xt_NOTRACK.c

为了和iptables兼容(因为iptables找模块文件前缀是按“ipt_”或“ip6t_”找的),这些文件中增加了一个新的宏定义:MODULE_ALIAS,来表示模块的别名。

如在xt_limit.c中就如下定义:
/* Aliases so the module can also be found under the ipt_/ip6t_ prefixes that iptables looks for. */
MODULE_ALIAS("ipt_limit");
MODULE_ALIAS("ip6t_limit");

在include/linux/netfilter_ipv4/ip_tables.h中进行了以下定义:
/* Map the IPv4-specific type names onto the protocol-independent xt_ types. */
#define ipt_match xt_match
#define ipt_target xt_target
#define ipt_table xt_table


2,代码分析

以下是新匹配和目标模块的结构定义:
/*
 * Descriptor for one match extension.  Instances are registered with
 * xt_register_match(), which links them into the per-family match list.
 */
struct xt_match {
	/* List linkage; xt_register_match() adds this to xt[family].match. */
	struct list_head list;

	/* Name of the match extension. */
	const char name[XT_FUNCTION_MAXNAMELEN-1];

	/*
	 * Packet test.  Returns true or false; to force an immediate
	 * packet drop, return false and set *hotdrop = 1.
	 * The argument list changed after 2.6.9 because the callback must
	 * now cope with non-linear skbs (skb_header_pointer and
	 * skb_ip_make_writable).
	 */
	int (*match)(const struct sk_buff *skb,
		     const struct net_device *in,
		     const struct net_device *out,
		     const struct xt_match *match,
		     const void *matchinfo,
		     int offset,
		     unsigned int protoff,
		     int *hotdrop);

	/*
	 * Invoked when the user tries to insert an entry of this type.
	 * Should return true or false.
	 */
	int (*checkentry)(const char *tablename,
			  const void *ip,
			  const struct xt_match *match,
			  void *matchinfo,
			  unsigned int matchinfosize,
			  unsigned int hook_mask);

	/* Invoked when an entry of this type is deleted. */
	void (*destroy)(const struct xt_match *match, void *matchinfo,
			unsigned int matchinfosize);

	/* Invoked when userspace alignment differs from the kernel's. */
	int (*compat)(void *match, void **dstptr, int *size, int convert);

	/* THIS_MODULE when built as a module, otherwise NULL. */
	struct module *me;

	/* If non-NULL, xt_check_match() only accepts this match in that table. */
	char *table;
	/* XT_ALIGN(matchsize) must equal the size given to xt_check_match(). */
	unsigned int matchsize;
	/* If non-zero, xt_check_match() rejects hook_mask bits outside this mask. */
	unsigned int hooks;
	/* If non-zero, xt_check_match() requires this protocol, not inverted. */
	unsigned short proto;

	/* Protocol family; selects the xt[] slot in xt_register_match(). */
	unsigned short family;
	u_int8_t revision;
};

/*
 * Descriptor for one target extension.  Instances are registered with
 * xt_register_target(), which links them into the per-family target list.
 */
struct xt_target {
	/* List linkage; xt_register_target() adds this to xt[family].target. */
	struct list_head list;

	/* Name of the target extension. */
	const char name[XT_FUNCTION_MAXNAMELEN-1];

	/*
	 * Target action; returns the verdict.  The argument order changed
	 * after 2.6.9 because the callback must now cope with non-linear
	 * skbs (skb_copy_bits and skb_ip_make_writable).
	 */
	unsigned int (*target)(struct sk_buff **pskb,
			       const struct net_device *in,
			       const struct net_device *out,
			       unsigned int hooknum,
			       const struct xt_target *target,
			       const void *targinfo,
			       void *userdata);

	/*
	 * Invoked when the user tries to insert an entry of this type;
	 * hook_mask is a bitmask of the hooks it can be called from.
	 * Should return true or false.
	 */
	int (*checkentry)(const char *tablename,
			  const void *entry,
			  const struct xt_target *target,
			  void *targinfo,
			  unsigned int targinfosize,
			  unsigned int hook_mask);

	/* Invoked when an entry of this type is deleted. */
	void (*destroy)(const struct xt_target *target, void *targinfo,
			unsigned int targinfosize);

	/* Invoked when userspace alignment differs from the kernel's. */
	int (*compat)(void *target, void **dstptr, int *size, int convert);

	/* THIS_MODULE when built as a module, otherwise NULL. */
	struct module *me;

	/* If non-NULL, xt_check_target() only accepts this target in that table. */
	char *table;
	/* XT_ALIGN(targetsize) must equal the size given to xt_check_target(). */
	unsigned int targetsize;
	/* If non-zero, xt_check_target() rejects hook_mask bits outside this mask. */
	unsigned int hooks;
	/* If non-zero, xt_check_target() requires this protocol, not inverted. */
	unsigned short proto;

	/* Protocol family; selects the xt[] slot in xt_register_target(). */
	unsigned short family;
	u_int8_t revision;
};

/*
 * One registered table.  xt_register_table() links it into the table
 * list of the protocol family given by 'af'.
 */
struct xt_table {
	/* List linkage into xt[af].tables. */
	struct list_head list;

	/* Unique table name; xt_register_table() refuses duplicates. */
	char name[XT_TABLE_MAXNAMELEN];

	/* Hooks this table will be entered on. */
	unsigned int valid_hooks;

	/* Lock protecting the private data; set up by xt_register_table(). */
	rwlock_t lock;

	/*
	 * Opaque pointer to the table's private data.  xt_register_table()
	 * reads it back as a struct xt_table_info *.
	 */
	void *private;

	/* THIS_MODULE when built as a module, otherwise NULL. */
	struct module *me;

	/* Address/protocol family. */
	int af;
};

/* Rule storage and bookkeeping for one table. */
struct xt_table_info {
	/* Size per table; set from the size passed to xt_alloc_table_info(). */
	unsigned int size;
	/* Number of entries: FIXME. --RR */
	unsigned int number;
	/*
	 * Initial number of entries, needed for the module usage count.
	 * xt_register_table() copies 'number' into it.
	 */
	unsigned int initial_entries;

	/* Entry points and underflows, one slot per hook. */
	unsigned int hook_entry[NF_IP_NUMHOOKS];
	unsigned int underflow[NF_IP_NUMHOOKS];

	/* ipt_entry tables: one copy per CPU. */
	char *entries[NR_CPUS];
};

/*
 * Core per-protocol-family state: the lists of registered matches,
 * targets and tables, plus the mutexes used with them.
 */
struct xt_af {
	/* Taken by the xt_register_* functions around list updates. */
	struct mutex mutex;
	/* Registered struct xt_match entries. */
	struct list_head match;
	/* Registered struct xt_target entries. */
	struct list_head target;
	/* Registered struct xt_table entries. */
	struct list_head tables;
	/* Initialised by xt_init() only when CONFIG_COMPAT is defined. */
	struct mutex compat_mutex;
};

/* Per-family management data: array of NPROTO entries allocated by xt_init(), indexed by protocol family. */
static struct xt_af *xt;

/*
 * Module initialisation for the x_tables core.
 *
 * Allocates one struct xt_af per protocol family (NPROTO entries) and
 * initialises the mutexes and the match/target/table lists of every entry.
 *
 * Returns 0 on success or -ENOMEM if the array cannot be allocated.
 */
static int __init xt_init(void)
{
	int i;

	/* One slot per protocol family. */
	xt = kmalloc(sizeof(struct xt_af) * NPROTO, GFP_KERNEL);
	if (!xt)
		return -ENOMEM;

	for (i = 0; i < NPROTO; i++) {
		/* xt points to an array: each family's slot is xt[i]. */
		mutex_init(&xt[i].mutex);
#ifdef CONFIG_COMPAT
		mutex_init(&xt[i].compat_mutex);
#endif

		/* Start with empty target, match and table lists. */
		INIT_LIST_HEAD(&xt[i].target);
		INIT_LIST_HEAD(&xt[i].match);
		INIT_LIST_HEAD(&xt[i].tables);
	}
	return 0;
}

目前2.6.16内核中支持了三类协议族,IPv4/IPv6/ARP,在各协议族中查找相应模块用的前缀为:
/*
 * Per-family name prefix, indexed by protocol family.  It is used to build
 * the "%s_tables: ..." messages in xt_check_match() and xt_check_target().
 */
static const char *xt_prefix[NPROTO] = {
[AF_INET] = "ip",
[AF_INET6] = "ip6",
[NF_ARP] = "arp",
};
对应的具体前缀分别为“ipt”、“ip6t”、“arpt”。

而和老的2.4内核的struct ipt_match和struct ipt_target结构的主要区别是增加了compat函数,以及struct module *me成员后面的一系列成员,这些成员是和协议相关的。比如limit匹配,分别为ipv4和ipv6定义了匹配结构后,只有family参数不同,一个是AF_INET,另一个是AF_INET6,其他都相同,而挂接时并不会有问题,因为这些模块都分别挂接到不同协议族的链表:

/* Registration hooks for targets. */
int
xt_register_target(struct xt_target *target)
{
int ret, af = target->family;

ret = mutex_lock_interruptible(&xt[af].mutex);
if (ret != 0)
 return ret;
 
/* 添加 target*/
list_add(&target->list, &xt[af].target);
mutex_unlock(&xt[af].mutex);
return ret;
}

int
xt_register_match(struct xt_match *match)
{
int ret, af = match->family;

ret = mutex_lock_interruptible(&xt[af].mutex);
if (ret != 0)
 return ret;
/* 添加match */
list_add(&match->list, &xt[af].match);
mutex_unlock(&xt[af].mutex);

return ret;
}

table注册发生在各协议的netfilter代码中:

/*
 * Register an IPv4 table.
 *
 * Allocates a new xt_table_info sized for @repl, copies the initial rule
 * blob into the current CPU's copy, has translate_table() process it, and
 * then passes the result to xt_register_table().
 *
 * @table: table to register.
 * @repl:  initial ruleset (size, entries, hook entry points, underflows).
 *
 * Returns 0 on success, -ENOMEM if the table info cannot be allocated, or
 * the error reported by translate_table() / xt_register_table().  On any
 * failure after allocation the new table info is freed.
 */
int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl)
{
	int ret;
	struct xt_table_info *newinfo;
	static struct xt_table_info bootstrap
		= { 0, 0, 0, { 0 }, { 0 }, { } };
	void *loc_cpu_entry;

	newinfo = xt_alloc_table_info(repl->size);
	if (!newinfo)
		return -ENOMEM;

	/* choose the copy on our node/cpu
	 * but dont care of preemption
	 */
	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
	memcpy(loc_cpu_entry, repl->entries, repl->size);

	ret = translate_table(table->name, table->valid_hooks,
			      newinfo, loc_cpu_entry, repl->size,
			      repl->num_entries,
			      repl->hook_entry,
			      repl->underflow);
	if (ret != 0)
		goto free_newinfo;

	/*
	 * Keep the registration result: returning the stale (zero) value
	 * from translate_table() would report success after a failure.
	 */
	ret = xt_register_table(table, &bootstrap, newinfo);
	if (ret != 0)
		goto free_newinfo;

	return 0;

free_newinfo:
	xt_free_table_info(newinfo);
	return ret;
}

/*
 * Allocate an xt_table_info with one entries buffer of @size bytes for
 * every possible CPU.
 *
 * Returns the new table info, or NULL if @size is rejected or any
 * allocation fails.
 */
struct xt_table_info *xt_alloc_table_info(unsigned int size)
{
	struct xt_table_info *info;
	int cpu;

	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
	/* Reject sizes needing more pages than num_physpages. */
	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages)
		return NULL;

	info = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL);
	if (info == NULL)
		return NULL;

	info->size = size;

	for_each_possible_cpu(cpu) {
		int node = cpu_to_node(cpu);
		void *area;

		/* Up to one page: kmalloc_node; anything larger: vmalloc_node. */
		if (size > PAGE_SIZE)
			area = vmalloc_node(size, node);
		else
			area = kmalloc_node(size, GFP_KERNEL, node);

		info->entries[cpu] = area;
		if (area == NULL) {
			/*
			 * NOTE(review): presumably xt_free_table_info() copes
			 * with the still-zeroed entries[] slots - confirm.
			 */
			xt_free_table_info(info);
			return NULL;
		}
	}

	return info;
}

/*
 * Register @table with the protocol family given by table->af.
 *
 * @table:     table to register.
 * @bootstrap: placeholder table info installed as table->private before
 *             xt_replace_table() is called.
 * @newinfo:   table info handed to xt_replace_table().
 *
 * Returns 0 on success, -EEXIST if a table with the same name is already
 * on the family's list, the non-zero result of mutex_lock_interruptible()
 * if the lock could not be taken, or the error that xt_replace_table()
 * stored on failure.
 */
int xt_register_table(struct xt_table *table,
		      struct xt_table_info *bootstrap,
		      struct xt_table_info *newinfo)
{
	struct xt_table_info *info;
	int err;

	err = mutex_lock_interruptible(&xt[table->af].mutex);
	if (err)
		return err;

	/* Don't autoload: we'd eat our tail... */
	if (list_named_find(&xt[table->af].tables, table->name)) {
		err = -EEXIST;
		goto out_unlock;
	}

	/* Simplifies replace_table code. */
	table->private = bootstrap;
	rwlock_init(&table->lock);
	if (!xt_replace_table(table, 0, newinfo, &err))
		goto out_unlock;

	info = table->private;
	duprintf("table->private->number = %u\n", info->number);

	/* Remember how many entries the table started with. */
	info->initial_entries = info->number;

	list_prepend(&xt[table->af].tables, table);
	err = 0;

out_unlock:
	mutex_unlock(&xt[table->af].mutex);
	return err;
}

但在进行实际匹配目标查找时会进行名字、协议族、表名、挂接点、协议等的比较,如匹配的检查:
int xt_check_match(const struct xt_match *match, unsigned short family,
unsigned int size, const char *table, unsigned int hook_mask,
 unsigned short proto, int inv_proto)
{
if (XT_ALIGN(match->matchsize) != size) {
 printk("%s_tables: %s match: invalid size %Zu != %u\n",
xt_prefix[family], match->name,
XT_ALIGN(match->matchsize), size);
 return -EINVAL;
}
if (match->table && strcmp(match->table, table)) {
 printk("%s_tables: %s match: only valid in %s table, not %s\n",
xt_prefix[family], match->name, match->table, table);
 return -EINVAL;
}
if (match->hooks && (hook_mask & ~match->hooks) != 0) {
 printk("%s_tables: %s match: bad hook_mask %u\n",
xt_prefix[family], match->name, hook_mask);
 return -EINVAL;
}
if (match->proto && (match->proto != proto || inv_proto)) {
 printk("%s_tables: %s match: only valid for protocol %u\n",
xt_prefix[family], match->name, match->proto);
 return -EINVAL;
}
return 0;
}

int xt_check_target(const struct xt_target *target, unsigned short family,
 unsigned int size, const char *table, unsigned int hook_mask,
 unsigned short proto, int inv_proto)
{
if (XT_ALIGN(target->targetsize) != size) {
 printk("%s_tables: %s target: invalid size %Zu != %u\n",
xt_prefix[family], target->name,
XT_ALIGN(target->targetsize), size);
 return -EINVAL;
}
if (target->table && strcmp(target->table, table)) {
 printk("%s_tables: %s target: only valid in %s table, not %s\n",
xt_prefix[family], target->name, target->table, table);
 return -EINVAL;
}
if (target->hooks && (hook_mask & ~target->hooks) != 0) {
 printk("%s_tables: %s target: bad hook_mask %u\n",
xt_prefix[family], target->name, hook_mask);
 return -EINVAL;
}
if (target->proto && (target->proto != proto || inv_proto)) {
 printk("%s_tables: %s target: only valid for protocol %u\n",
xt_prefix[family], target->name, target->proto);
 return -EINVAL;
}
return 0;
}

/* The IPsec policy match handling follows. */

/*
 * IPv4 registration record for the "policy" match.  Each member is
 * initialised exactly once; .family was previously listed twice.
 */
static struct xt_match policy_match = {
	.name		= "policy",
	.family		= AF_INET,
	.match		= match,
	.matchsize	= sizeof(struct xt_policy_info),
	.checkentry	= checkentry,
	.me		= THIS_MODULE,
};

/*
 * Module init: register the "policy" match records policy_match and
 * policy6_match.  If the second registration fails, the first one is
 * undone so that nothing stays registered.
 *
 * Returns 0 on success or the error from xt_register_match().
 */
static int __init init(void)
{
	int err = xt_register_match(&policy_match);

	if (err)
		return err;

	err = xt_register_match(&policy6_match);
	if (err)
		xt_unregister_match(&policy_match);

	return err;
}

/*
 * Match the security path of an inbound packet against the rule's policy.
 *
 * @skb:    packet; its security path is read from skb->sp.
 * @info:   policy description from the rule (flags, len, pol[]).
 * @family: protocol family passed through to match_xfrm_state().
 *
 * Returns -1 if the packet has no security path, 1 on a match and 0
 * otherwise.  Without XT_POLICY_MATCH_STRICT a single matching state is
 * enough; with it, the lengths must be equal and every state must match.
 */
static int
match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info,
		unsigned short family)
{
	const struct xt_policy_elem *e;
	struct sec_path *sp = skb->sp;
	int strict = info->flags & XT_POLICY_MATCH_STRICT;
	int i, pos;

	if (sp == NULL)
		return -1;
	if (strict && info->len != sp->len)
		return 0;

	/* Walk the security path from its last state down to the first. */
	for (i = sp->len - 1; i >= 0; i--) {
		/*
		 * NOTE(review): in strict mode pos is 0 for the last state
		 * and negative for earlier ones - confirm the intended
		 * indexing of info->pol[].
		 */
		pos = strict ? i - sp->len + 1 : 0;
		if (pos >= info->len)
			return 0;
		e = &info->pol[pos];

		/* Check the i-th state of the path against the policy element. */
		if (match_xfrm_state(sp->xvec[i], e, family)) {
			if (!strict)
				return 1;
		} else if (strict)
			return 0;
	}

	return strict ? 1 : 0;
}