Linux kernel & device driver programming

Cross-Referenced Linux and Device Driver Code

[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]
Version: [ 2.6.11.8 ] [ 2.6.25 ] [ 2.6.25.8 ] [ 2.6.31.13 ] Architecture: [ i386 ]
  1 /* NAT for netfilter; shared with compatibility layer. */
  2 
  3 /* (C) 1999-2001 Paul `Rusty' Russell
  4  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
  5  *
  6  * This program is free software; you can redistribute it and/or modify
  7  * it under the terms of the GNU General Public License version 2 as
  8  * published by the Free Software Foundation.
  9  */
 10 
 11 #include <linux/module.h>
 12 #include <linux/types.h>
 13 #include <linux/timer.h>
 14 #include <linux/skbuff.h>
 15 #include <linux/netfilter_ipv4.h>
 16 #include <linux/vmalloc.h>
 17 #include <net/checksum.h>
 18 #include <net/icmp.h>
 19 #include <net/ip.h>
 20 #include <net/tcp.h>  /* For tcp_prot in getorigdst */
 21 #include <linux/icmp.h>
 22 #include <linux/udp.h>
 23 #include <linux/jhash.h>
 24 
 25 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
 26 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
 27 
 28 #include <linux/netfilter_ipv4/ip_conntrack.h>
 29 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 30 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
 31 #include <linux/netfilter_ipv4/ip_nat.h>
 32 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
 33 #include <linux/netfilter_ipv4/ip_nat_core.h>
 34 #include <linux/netfilter_ipv4/ip_nat_helper.h>
 35 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 36 #include <linux/netfilter_ipv4/listhelp.h>
 37 
 38 #if 0
 39 #define DEBUGP printk
 40 #else
 41 #define DEBUGP(format, args...)
 42 #endif
 43 
 44 DECLARE_RWLOCK(ip_nat_lock);
 45 
 46 /* Calculated at init based on memory size */
 47 static unsigned int ip_nat_htable_size;
 48 
 49 static struct list_head *bysource;
 50 struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
 51 
 52 
 53 /* We keep an extra hash for each conntrack, for fast searching. */
 54 static inline unsigned int
 55 hash_by_src(const struct ip_conntrack_tuple *tuple)
 56 {
 57         /* Original src, to ensure we map it consistently if poss. */
 58         return jhash_3words(tuple->src.ip, tuple->src.u.all,
 59                             tuple->dst.protonum, 0) % ip_nat_htable_size;
 60 }
 61 
 62 /* Noone using conntrack by the time this called. */
 63 static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
 64 {
 65         if (!(conn->status & IPS_NAT_DONE_MASK))
 66                 return;
 67 
 68         WRITE_LOCK(&ip_nat_lock);
 69         list_del(&conn->nat.info.bysource);
 70         WRITE_UNLOCK(&ip_nat_lock);
 71 }
 72 
 73 /* We do checksum mangling, so if they were wrong before they're still
 74  * wrong.  Also works for incomplete packets (eg. ICMP dest
 75  * unreachables.) */
 76 u_int16_t
 77 ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
 78 {
 79         u_int32_t diffs[] = { oldvalinv, newval };
 80         return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
 81                                       oldcheck^0xFFFF));
 82 }
 83 
 84 /* Is this tuple already taken? (not by us) */
 85 int
 86 ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
 87                   const struct ip_conntrack *ignored_conntrack)
 88 {
 89         /* Conntrack tracking doesn't keep track of outgoing tuples; only
 90            incoming ones.  NAT means they don't have a fixed mapping,
 91            so we invert the tuple and look for the incoming reply.
 92 
 93            We could keep a separate hash if this proves too slow. */
 94         struct ip_conntrack_tuple reply;
 95 
 96         invert_tuplepr(&reply, tuple);
 97         return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
 98 }
 99 
100 /* If we source map this tuple so reply looks like reply_tuple, will
101  * that meet the constraints of range. */
102 static int
103 in_range(const struct ip_conntrack_tuple *tuple,
104          const struct ip_nat_range *range)
105 {
106         struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
107 
108         /* If we are supposed to map IPs, then we must be in the
109            range specified, otherwise let this drag us onto a new src IP. */
110         if (range->flags & IP_NAT_RANGE_MAP_IPS) {
111                 if (ntohl(tuple->src.ip) < ntohl(range->min_ip)
112                     || ntohl(tuple->src.ip) > ntohl(range->max_ip))
113                         return 0;
114         }
115 
116         if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
117             || proto->in_range(tuple, IP_NAT_MANIP_SRC,
118                                &range->min, &range->max))
119                 return 1;
120 
121         return 0;
122 }
123 
124 static inline int
125 same_src(const struct ip_conntrack *ct,
126          const struct ip_conntrack_tuple *tuple)
127 {
128         return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
129                 == tuple->dst.protonum
130                 && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
131                 == tuple->src.ip
132                 && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
133                 == tuple->src.u.all);
134 }
135 
136 /* Only called for SRC manip */
137 static int
138 find_appropriate_src(const struct ip_conntrack_tuple *tuple,
139                      struct ip_conntrack_tuple *result,
140                      const struct ip_nat_range *range)
141 {
142         unsigned int h = hash_by_src(tuple);
143         struct ip_conntrack *ct;
144 
145         READ_LOCK(&ip_nat_lock);
146         list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
147                 if (same_src(ct, tuple)) {
148                         /* Copy source part from reply tuple. */
149                         invert_tuplepr(result,
150                                        &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
151                         result->dst = tuple->dst;
152 
153                         if (in_range(result, range)) {
154                                 READ_UNLOCK(&ip_nat_lock);
155                                 return 1;
156                         }
157                 }
158         }
159         READ_UNLOCK(&ip_nat_lock);
160         return 0;
161 }
162 
163 /* For [FUTURE] fragmentation handling, we want the least-used
164    src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
165    if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
166    1-65535, we don't do pro-rata allocation based on ports; we choose
167    the ip with the lowest src-ip/dst-ip/proto usage.
168 */
169 static void
170 find_best_ips_proto(struct ip_conntrack_tuple *tuple,
171                     const struct ip_nat_range *range,
172                     const struct ip_conntrack *conntrack,
173                     enum ip_nat_manip_type maniptype)
174 {
175         u_int32_t *var_ipp;
176         /* Host order */
177         u_int32_t minip, maxip, j;
178 
179         /* No IP mapping?  Do nothing. */
180         if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
181                 return;
182 
183         if (maniptype == IP_NAT_MANIP_SRC)
184                 var_ipp = &tuple->src.ip;
185         else
186                 var_ipp = &tuple->dst.ip;
187 
188         /* Fast path: only one choice. */
189         if (range->min_ip == range->max_ip) {
190                 *var_ipp = range->min_ip;
191                 return;
192         }
193 
194         /* Hashing source and destination IPs gives a fairly even
195          * spread in practice (if there are a small number of IPs
196          * involved, there usually aren't that many connections
197          * anyway).  The consistency means that servers see the same
198          * client coming from the same IP (some Internet Banking sites
199          * like this), even across reboots. */
200         minip = ntohl(range->min_ip);
201         maxip = ntohl(range->max_ip);
202         j = jhash_2words(tuple->src.ip, tuple->dst.ip, 0);
203         *var_ipp = htonl(minip + j % (maxip - minip + 1));
204 }
205 
206 /* Manipulate the tuple into the range given.  For NF_IP_POST_ROUTING,
207  * we change the source to map into the range.  For NF_IP_PRE_ROUTING
208  * and NF_IP_LOCAL_OUT, we change the destination to map into the
209  * range.  It might not be possible to get a unique tuple, but we try.
210  * At worst (or if we race), we will end up with a final duplicate in
211  * __ip_conntrack_confirm and drop the packet. */
212 static void
213 get_unique_tuple(struct ip_conntrack_tuple *tuple,
214                  const struct ip_conntrack_tuple *orig_tuple,
215                  const struct ip_nat_range *range,
216                  struct ip_conntrack *conntrack,
217                  enum ip_nat_manip_type maniptype)
218 {
219         struct ip_nat_protocol *proto
220                 = ip_nat_find_proto(orig_tuple->dst.protonum);
221 
222         /* 1) If this srcip/proto/src-proto-part is currently mapped,
223            and that same mapping gives a unique tuple within the given
224            range, use that.
225 
226            This is only required for source (ie. NAT/masq) mappings.
227            So far, we don't do local source mappings, so multiple
228            manips not an issue.  */
229         if (maniptype == IP_NAT_MANIP_SRC) {
230                 if (find_appropriate_src(orig_tuple, tuple, range)) {
231                         DEBUGP("get_unique_tuple: Found current src map\n");
232                         if (!ip_nat_used_tuple(tuple, conntrack))
233                                 return;
234                 }
235         }
236 
237         /* 2) Select the least-used IP/proto combination in the given
238            range. */
239         *tuple = *orig_tuple;
240         find_best_ips_proto(tuple, range, conntrack, maniptype);
241 
242         /* 3) The per-protocol part of the manip is made to map into
243            the range to make a unique tuple. */
244 
245         /* Only bother mapping if it's not already in range and unique */
246         if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
247              || proto->in_range(tuple, maniptype, &range->min, &range->max))
248             && !ip_nat_used_tuple(tuple, conntrack))
249                 return;
250 
251         /* Last change: get protocol to try to obtain unique tuple. */
252         proto->unique_tuple(tuple, range, maniptype, conntrack);
253 }
254 
255 unsigned int
256 ip_nat_setup_info(struct ip_conntrack *conntrack,
257                   const struct ip_nat_range *range,
258                   unsigned int hooknum)
259 {
260         struct ip_conntrack_tuple curr_tuple, new_tuple;
261         struct ip_nat_info *info = &conntrack->nat.info;
262         int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
263         enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
264 
265         IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
266                      || hooknum == NF_IP_POST_ROUTING
267                      || hooknum == NF_IP_LOCAL_IN
268                      || hooknum == NF_IP_LOCAL_OUT);
269         BUG_ON(ip_nat_initialized(conntrack, maniptype));
270 
271         /* What we've got will look like inverse of reply. Normally
272            this is what is in the conntrack, except for prior
273            manipulations (future optimization: if num_manips == 0,
274            orig_tp =
275            conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
276         invert_tuplepr(&curr_tuple,
277                        &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
278 
279         get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);
280 
281         if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
282                 struct ip_conntrack_tuple reply;
283 
284                 /* Alter conntrack table so will recognize replies. */
285                 invert_tuplepr(&reply, &new_tuple);
286                 ip_conntrack_alter_reply(conntrack, &reply);
287 
288                 /* Non-atomic: we own this at the moment. */
289                 if (maniptype == IP_NAT_MANIP_SRC)
290                         conntrack->status |= IPS_SRC_NAT;
291                 else
292                         conntrack->status |= IPS_DST_NAT;
293         }
294 
295         /* Place in source hash if this is the first time. */
296         if (have_to_hash) {
297                 unsigned int srchash
298                         = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
299                                       .tuple);
300                 WRITE_LOCK(&ip_nat_lock);
301                 list_add(&info->bysource, &bysource[srchash]);
302                 WRITE_UNLOCK(&ip_nat_lock);
303         }
304 
305         /* It's done. */
306         if (maniptype == IP_NAT_MANIP_DST)
307                 set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status);
308         else
309                 set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status);
310 
311         return NF_ACCEPT;
312 }
313 
314 /* Returns true if succeeded. */
315 static int
316 manip_pkt(u_int16_t proto,
317           struct sk_buff **pskb,
318           unsigned int iphdroff,
319           const struct ip_conntrack_tuple *target,
320           enum ip_nat_manip_type maniptype)
321 {
322         struct iphdr *iph;
323 
324         (*pskb)->nfcache |= NFC_ALTERED;
325         if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
326                 return 0;
327 
328         iph = (void *)(*pskb)->data + iphdroff;
329 
330         /* Manipulate protcol part. */
331         if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
332                                                  target, maniptype))
333                 return 0;
334 
335         iph = (void *)(*pskb)->data + iphdroff;
336 
337         if (maniptype == IP_NAT_MANIP_SRC) {
338                 iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
339                                                 iph->check);
340                 iph->saddr = target->src.ip;
341         } else {
342                 iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip,
343                                                 iph->check);
344                 iph->daddr = target->dst.ip;
345         }
346         return 1;
347 }
348 
349 /* Do packet manipulations according to ip_nat_setup_info. */
350 unsigned int nat_packet(struct ip_conntrack *ct,
351                         enum ip_conntrack_info ctinfo,
352                         unsigned int hooknum,
353                         struct sk_buff **pskb)
354 {
355         enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
356         unsigned long statusbit;
357         enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);
358 
359         if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)
360             && (hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_IN)) {
361                 DEBUGP("ip_nat_core: adjusting sequence number\n");
362                 /* future: put this in a l4-proto specific function,
363                  * and call this function here. */
364                 if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
365                         return NF_DROP;
366         }
367 
368         if (mtype == IP_NAT_MANIP_SRC)
369                 statusbit = IPS_SRC_NAT;
370         else
371                 statusbit = IPS_DST_NAT;
372 
373         /* Invert if this is reply dir. */
374         if (dir == IP_CT_DIR_REPLY)
375                 statusbit ^= IPS_NAT_MASK;
376 
377         /* Non-atomic: these bits don't change. */
378         if (ct->status & statusbit) {
379                 struct ip_conntrack_tuple target;
380 
381                 /* We are aiming to look like inverse of other direction. */
382                 invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
383 
384                 if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
385                         return NF_DROP;
386         }
387         return NF_ACCEPT;
388 }
389 
390 /* Dir is direction ICMP is coming from (opposite to packet it contains) */
391 int icmp_reply_translation(struct sk_buff **pskb,
392                            struct ip_conntrack *ct,
393                            enum ip_nat_manip_type manip,
394                            enum ip_conntrack_dir dir)
395 {
396         struct {
397                 struct icmphdr icmp;
398                 struct iphdr ip;
399         } *inside;
400         struct ip_conntrack_tuple inner, target;
401         int hdrlen = (*pskb)->nh.iph->ihl * 4;
402 
403         if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
404                 return 0;
405 
406         inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
407 
408         /* We're actually going to mangle it beyond trivial checksum
409            adjustment, so make sure the current checksum is correct. */
410         if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
411                 hdrlen = (*pskb)->nh.iph->ihl * 4;
412                 if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
413                                                 (*pskb)->len - hdrlen, 0)))
414                         return 0;
415         }
416 
417         /* Must be RELATED */
418         IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
419                      (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
420 
421         /* Redirects on non-null nats must be dropped, else they'll
422            start talking to each other without our translation, and be
423            confused... --RR */
424         if (inside->icmp.type == ICMP_REDIRECT) {
425                 /* If NAT isn't finished, assume it and drop. */
426                 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
427                         return 0;
428 
429                 if (ct->status & IPS_NAT_MASK)
430                         return 0;
431         }
432 
433         DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
434                *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
435 
436         if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
437                              sizeof(struct icmphdr) + inside->ip.ihl*4,
438                              &inner, ip_ct_find_proto(inside->ip.protocol)))
439                 return 0;
440 
441         /* Change inner back to look like incoming packet.  We do the
442            opposite manip on this hook to normal, because it might not
443            pass all hooks (locally-generated ICMP).  Consider incoming
444            packet: PREROUTING (DST manip), routing produces ICMP, goes
445            through POSTROUTING (which must correct the DST manip). */
446         if (!manip_pkt(inside->ip.protocol, pskb,
447                        (*pskb)->nh.iph->ihl*4
448                        + sizeof(inside->icmp),
449                        &ct->tuplehash[!dir].tuple,
450                        !manip))
451                 return 0;
452 
453         /* Reloading "inside" here since manip_pkt inner. */
454         inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
455         inside->icmp.checksum = 0;
456         inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
457                                                        (*pskb)->len - hdrlen,
458                                                        0));
459 
460         /* Change outer to look the reply to an incoming packet
461          * (proto 0 means don't invert per-proto part). */
462 
463         /* Obviously, we need to NAT destination IP, but source IP
464            should be NAT'ed only if it is from a NAT'd host.
465 
466            Explanation: some people use NAT for anonymizing.  Also,
467            CERT recommends dropping all packets from private IP
468            addresses (although ICMP errors from internal links with
469            such addresses are not too uncommon, as Alan Cox points
470            out) */
471         if (manip != IP_NAT_MANIP_SRC
472             || ((*pskb)->nh.iph->saddr == ct->tuplehash[dir].tuple.src.ip)) {
473                 invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
474                 if (!manip_pkt(0, pskb, 0, &target, manip))
475                         return 0;
476         }
477 
478         return 1;
479 }
480 
481 /* Protocol registration. */
482 int ip_nat_protocol_register(struct ip_nat_protocol *proto)
483 {
484         int ret = 0;
485 
486         WRITE_LOCK(&ip_nat_lock);
487         if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
488                 ret = -EBUSY;
489                 goto out;
490         }
491         ip_nat_protos[proto->protonum] = proto;
492  out:
493         WRITE_UNLOCK(&ip_nat_lock);
494         return ret;
495 }
496 
497 /* Noone stores the protocol anywhere; simply delete it. */
498 void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
499 {
500         WRITE_LOCK(&ip_nat_lock);
501         ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
502         WRITE_UNLOCK(&ip_nat_lock);
503 
504         /* Someone could be still looking at the proto in a bh. */
505         synchronize_net();
506 }
507 
508 int __init ip_nat_init(void)
509 {
510         size_t i;
511 
512         /* Leave them the same for the moment. */
513         ip_nat_htable_size = ip_conntrack_htable_size;
514 
515         /* One vmalloc for both hash tables */
516         bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
517         if (!bysource)
518                 return -ENOMEM;
519 
520         /* Sew in builtin protocols. */
521         WRITE_LOCK(&ip_nat_lock);
522         for (i = 0; i < MAX_IP_NAT_PROTO; i++)
523                 ip_nat_protos[i] = &ip_nat_unknown_protocol;
524         ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
525         ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
526         ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
527         WRITE_UNLOCK(&ip_nat_lock);
528 
529         for (i = 0; i < ip_nat_htable_size; i++) {
530                 INIT_LIST_HEAD(&bysource[i]);
531         }
532 
533         /* FIXME: Man, this is a hack.  <SIGH> */
534         IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
535         ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
536 
537         /* Initialize fake conntrack so that NAT will skip it */
538         ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
539         return 0;
540 }
541 
542 /* Clear NAT section of all conntracks, in case we're loaded again. */
543 static int clean_nat(struct ip_conntrack *i, void *data)
544 {
545         memset(&i->nat, 0, sizeof(i->nat));
546         i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
547         return 0;
548 }
549 
550 /* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
551 void ip_nat_cleanup(void)
552 {
553         ip_ct_iterate_cleanup(&clean_nat, NULL);
554         ip_conntrack_destroyed = NULL;
555         vfree(bysource);
556 }
557 
  This page was automatically generated by the LXR engine.