view src/gt_web_cache.c @ 0:d39e1d0d75b6

initial add
author paulo@hit-nxdomain.opendns.com
date Sat, 20 Feb 2010 21:18:28 -0800
parents
children
line source
1 /*
2 * $Id: gt_web_cache.c,v 1.65 2006/08/06 16:53:36 hexwab Exp $
3 *
4 * Copyright (C) 2001-2003 giFT project (gift.sourceforge.net)
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 */
17 #include "gt_gnutella.h"
19 #include "file_cache.h"
20 #include "http_request.h"
22 #include "gt_connect.h"
23 #include "gt_node.h"
24 #include "gt_node_list.h"
25 #include "gt_netorg.h"
27 #include "gt_web_cache.h"
28 #include "gt_conf.h"
30 #include "dns.h"
32 /*****************************************************************************/
34 /* minimum time to wait before reconnecting to a webcache */
35 #define CACHE_RETRY_TIME (8 * EHOURS)
37 /*****************************************************************************/
39 /* number of times we have hit some gwebcaches */
40 static int cache_hits;
42 /* the absolute next time we will allow ourselves to access a cache */
43 static time_t next_atime;
45 /* amount of time to layoff the caches once we've received some data */
46 static time_t backoff_time = 1 * EHOURS;
48 /* holds all the caches */
49 static FileCache *web_caches;
51 /* proxy server to contact */
52 static char *gt_proxy_server;
54 /* webcaches that succeeded connecting, but returned errors or an
55 * unparseable response */
56 static FileCache *bad_caches;
58 /* whether we are in the process of checking the caches */
59 static BOOL checking_caches;
61 /*****************************************************************************/
63 static void parse_hostfile_response (HttpRequest *http_req, char *hosts_file);
64 static void parse_urlfile_response (HttpRequest *http_req, char *url_file);
66 /*****************************************************************************/
68 /* parse the extended data in the webcaches file, now its just mtime */
69 static BOOL parse_web_cache_value (char *value, time_t *r_atime)
70 {
71 time_t atime;
73 if ((atime = ATOUL (value)) == (unsigned long) -1)
74 atime = 0;
76 if (r_atime)
77 *r_atime = atime;
79 return TRUE;
80 }
82 /*****************************************************************************/
/*
 * Build a newly allocated "http://<host>/<path>" string.  A NULL path is
 * treated as empty.  The caller frees the result.
 */
static char *new_webcache_url (const char *host, const char *path)
{
	const char *rel_path = STRING_NOTNULL(path);

	return stringf_dup ("http://%s/%s", host, rel_path);
}
89 static void ban_webcache (HttpRequest *req, const char *why)
90 {
91 char *url;
93 url = new_webcache_url (req->host, req->path);
94 GT->dbg (GT, "banning webcache %s", url);
96 file_cache_insert (bad_caches, url, why);
97 file_cache_sync (bad_caches);
99 free (url);
100 }
102 static void insert_webcache (const char *host_name, const char *remote_path,
103 time_t atime)
104 {
105 char *url;
106 char *field;
108 url = new_webcache_url (host_name, remote_path);
109 field = stringf_dup ("%lu", atime);
111 file_cache_insert (web_caches, url, field);
113 free (url);
114 free (field);
115 }
117 /*****************************************************************************/
119 static void handle_close_request (HttpRequest *req, int error_code)
120 {
121 String *s;
123 if (error_code < 0 || error_code < 200 || error_code >= 300)
124 {
125 if (error_code == -1)
126 {
127 /* the error was our fault, out of mem, etc. dont do anything */
128 GT->DBGFN (GT, "connect to server %s failed for some reason",
129 req->host);
130 }
131 else
132 {
133 char err[32];
135 snprintf (err, sizeof(err), "Received error %d", error_code);
137 /*
138 * Not found, internal server error, or too many redirects: ban
139 * the server's URL
140 */
141 GT->DBGFN (GT, "server %s returned error %i", req->host,
142 error_code);
143 ban_webcache (req, err);
144 }
145 }
147 /* TODO: this assumes this is the one hostfile request flying around,
148 * and not a urlfile request, which probably needs to be handled
149 * separately */
150 checking_caches = FALSE;
152 if ((s = req->data))
153 string_free (s);
154 }
156 static void parse_hostfile_response (HttpRequest *http_req, char *host_file)
157 {
158 int hosts = 0;
159 GtNode *node;
160 time_t now;
162 if (!host_file)
163 {
164 GT->DBGFN (GT, "empty host file from %s", http_req->host);
165 return;
166 }
168 GT->DBGFN (GT, "hostfile from server = %s", host_file);
170 now = time (NULL);
172 /*
173 * If the response start with "ERROR: " (or pseudo-html '<' char), ban the
174 * webcache.
175 */
176 if (!strncasecmp (host_file, "ERROR", sizeof ("ERROR") - 1) ||
177 host_file[0] == '<')
178 {
179 ban_webcache (http_req, "Malformed response content");
180 return;
181 }
183 while (host_file && *host_file)
184 {
185 char *host;
186 in_addr_t ip;
187 in_port_t port;
189 host = string_sep_set (&host_file, "\r\n");
191 ip = net_ip (string_sep (&host, ":"));
192 port = ATOI (host);
194 if (!port || !ip || ip == INADDR_NONE)
195 continue;
197 GT->DBGFN (GT, "registering %s:%hu (from cache %s)", net_ip_str (ip),
198 port, http_req->host);
200 /* register the hosts as ultrapeers */
201 node = gt_node_register (ip, port, GT_NODE_ULTRA);
202 hosts++;
204 if (!node)
205 continue;
207 /* set the vitality on this node to preserve it across restarts */
208 node->vitality = now;
210 /* might be connected already */
211 if (node->state != GT_NODE_DISCONNECTED)
212 continue;
214 /* try to connect to the first 5 */
215 if (hosts <= 5 && gt_conn_need_connections (GT_NODE_ULTRA))
216 gt_connect (node);
218 /* don't allow the cache to register an infinite number of hosts */
219 if (hosts >= 50)
220 break;
221 }
223 /* save the nodes we added to disk so we dont hit the caches again */
224 gt_node_list_save ();
226 /*
227 * Do an exponential backoff from the caches. If we were online and
228 * able to receive data, we should be getting node information
229 * some other way now.
230 */
231 if (hosts >= 5)
232 {
233 next_atime = now + backoff_time;
234 backoff_time *= 2;
235 }
236 }
238 static void parse_urlfile_response (HttpRequest *http_req, char *url_file)
239 {
240 int caches = 0;
242 if (!url_file)
243 {
244 GT->DBGFN (GT, "empty url file from %s", http_req->host);
245 return;
246 }
248 GT->DBGFN (GT, "urlfile from server = %s", url_file);
250 while (url_file && *url_file)
251 {
252 char *url;
253 char *host_name;
254 char *remote_path;
256 url = string_sep_set (&url_file, "\r\n");
258 /* skip past http:// */
259 string_sep (&url, "http://");
261 host_name = string_sep (&url, "/");
262 remote_path = url;
264 /* NOTE: remote_path is possibly empty */
265 if (!host_name)
266 continue;
268 url = stringf ("http://%s/%s", host_name, STRING_NOTNULL(remote_path));
270 /* if the webcache is already in our db, skip it */
271 if (file_cache_lookup (web_caches, url))
272 continue;
274 /*
275 * Only allow caches to register two more caches: this
276 * small number helps to avoid our list of caches getting
277 * polluted.
278 */
279 if (++caches > 2)
280 break;
282 /* format is: <url> <last time visited> */
283 file_cache_insert (web_caches, url, "0");
284 }
286 /* sync the pending web caches to disk */
287 file_cache_sync (web_caches);
288 }
290 static void end_request (HttpRequest *req, char *data)
291 {
292 char *str = req->request;
294 if (str && !strncmp (str, "hostfile", strlen ("hostfile")))
295 parse_hostfile_response (req, data);
296 else if (str && !strncmp (str, "urlfile", strlen ("urlfile")))
297 parse_urlfile_response (req, data);
298 else
299 abort ();
300 }
302 /*****************************************************************************/
304 /*
305 * Return TRUE if newname is in the same domain as oldname. For example,
306 * "new.gwc.example.com", "example.com", and "cache.example.com" are all
307 * considered in the same domain as "www.example.com".
308 *
309 * This is called on redirects, to make sure the cache can't redirect to an
310 * innocent site as part of a DDoS attack.
311 */
312 static BOOL in_same_domain (const char *oldname, const char *newname)
313 {
314 return FALSE;
315 #if 0
316 const char *p;
317 const char *largest = NULL;
318 int periods = 0;
320 p = newname;
322 /* get the largest common substring */
323 while (p != NULL)
324 {
325 if ((largest = strstr (oldname, p)))
326 break;
328 /* advance to next domain part */
329 p = strchr (p + 1, '.');
330 }
332 if (!largest)
333 return FALSE;
335 /*
336 * Make sure the substring matches completely to the end. This will
337 * actually fail when it shouldn't if one name includes the '.' toplevel
338 * domain and one doesn't. Oh well.
339 */
340 if (strcmp (largest, p) != 0)
341 return FALSE;
343 /*
344 * Count the number of periods to find the number of subdomains in the
345 * largest common substring.
346 */
347 for (p = largest; *p != 0; p++)
348 {
349 if (*p == '.')
350 periods++;
351 }
353 /*
354 * If the last character is the root '.', subtract one, since we are
355 * looking for the number of common subdomains, and the root is shared by
356 * all names.
357 */
358 if (largest[strlen (largest) - 1] == '.')
359 periods--;
361 /*
362 * If there are two periods, at least two toplevel domains match.
363 */
364 if (periods >= 2)
365 return TRUE;
367 /*
368 * If there is only one period shared, the names MAY be in the same
369 * domain: one of the names has to be completely contained within the
370 * other, such as the case of "foo.example.com" and "example.com".
371 */
372 if (periods == 1 &&
373 (strcmp (largest, oldname) == 0 || strcmp (largest, newname) == 0))
374 {
375 return TRUE;
376 }
378 /* not in same domain */
379 return FALSE;
380 #endif
381 }
383 /*
384 * Called to when the webcache sends a 300-level response with a provided
385 * Location: header. Have to make sure the domain the cache directs us
386 * to is the same.
387 */
388 static BOOL handle_redirect (HttpRequest *req, const char *new_host,
389 const char *new_path)
390 {
391 assert (new_host != NULL);
393 if (in_same_domain (req->host, new_host) == FALSE)
394 return FALSE;
396 /* might want to do something else if the ban list later becomes per host
397 * rather than per URL */
398 ban_webcache (req, "Redirected");
400 GT->DBGFN (GT, "Redirecting to new webcache %s/%s", new_host, new_path);
402 insert_webcache (new_host, new_path, time (NULL));
403 file_cache_sync (web_caches);
405 return TRUE;
406 }
408 /*****************************************************************************/
410 static BOOL handle_recv (HttpRequest *req, char *data, size_t len)
411 {
412 String *s;
414 /* EOF */
415 if (!data)
416 {
417 char *str = NULL;
419 if ((s = req->data))
420 str = s->str;
422 GT->DBGFN (GT, "read %s from server %s", str, req->host);
423 end_request (req, str);
425 /* clear data link */
426 req->data = NULL;
428 return TRUE;
429 }
431 if (!len)
432 return TRUE;
434 GT->DBGFN (GT, "server sent us: %s", data);
436 if (!(s = req->data) && !(s = req->data = string_new (NULL, 0, 0, TRUE)))
437 return FALSE;
439 if (string_append (s, data) != len)
440 {
441 GT->DBGFN (GT, "string append failed");
442 return FALSE;
443 }
445 return TRUE;
446 }
448 /*****************************************************************************/
450 static BOOL handle_add_headers (HttpRequest *req, Dataset **headers)
451 {
452 /* don't let intermediaries cache our request, I think */
453 dataset_insertstr (headers, "Cache-Control", "no-cache");
455 return TRUE;
456 }
458 /*****************************************************************************/
460 static BOOL parse_host_and_port (char **r_host, in_port_t *r_port)
461 {
462 char *str;
463 char *host;
464 long port;
466 str = *r_host;
468 if (r_port)
469 *r_port = 80;
471 /* skip leading 'http://' if found */
472 if (strstr (str, "http://"))
473 str += strlen ("http://");
475 host = string_sep (&str, ":");
477 if (!host)
478 return FALSE;
480 *r_host = host;
482 if (str && !string_isempty (str))
483 {
484 port = gift_strtol (str);
486 /* make sure port is valid */
487 if (port <= 0 || port >= 65536)
488 return FALSE;
490 *r_port = port;
491 }
493 return TRUE;
494 }
496 static TCPC *open_http_connection (HttpRequest *req, const char *http_name)
497 {
498 in_addr_t ip;
499 in_port_t port;
500 char *str;
501 char *name;
502 TCPC *c;
503 struct hostent *host;
505 if (!http_name)
506 return NULL;
508 if (!(str = STRDUP (http_name)))
509 return NULL;
511 name = str;
513 if (!parse_host_and_port (&name, &port))
514 {
515 GT->DBGFN (GT, "error parsing hostname \"%s\"", str);
516 free (str);
517 return NULL;
518 }
520 if (!(host = gt_dns_lookup (name)))
521 {
522 free (str);
523 return NULL;
524 }
526 /* ip is in network-order already */
527 memcpy (&ip, host->h_addr, MIN (host->h_length, sizeof (ip)));
529 if (net_match_host (ip, "LOCAL"))
530 {
531 free (str);
532 ban_webcache (req, "Resolved to local IP");
533 return NULL;
534 }
536 c = tcp_open (ip, port, FALSE);
537 if (!c)
538 {
539 GT->DBGFN (GT, "couldn't open connection to %s [%s]: %s",
540 http_name, net_ip_str (ip), GIFT_NETERROR());
541 }
543 free (str);
544 return c;
545 }
547 /* return the name we have to lookup */
548 static char *get_http_name (char *name)
549 {
550 char *proxy;
551 char *host;
553 host = name;
554 proxy = HTTP_PROXY;
556 string_trim (proxy);
558 if (proxy && !string_isempty (proxy))
559 {
560 /* connect to the proxy instead */
561 if (STRCMP (proxy, gt_proxy_server) != 0)
562 {
563 GT->DBGFN (GT, "using proxy server %s", proxy);
564 free (gt_proxy_server);
565 gt_proxy_server = STRDUP (proxy);
566 }
568 host = proxy;
569 }
571 return host;
572 }
574 static void check_dns_error (const char *name, HttpRequest *req)
575 {
576 int error;
578 error = gt_dns_get_errno ();
580 if (!error)
581 return;
583 GT->DBGFN (GT, "lookup failed on \"%s\": %s", name, gt_dns_strerror(error));
585 /* ban the host, but only if not using a proxy server */
586 if (error == HOST_NOT_FOUND && gt_proxy_server == NULL)
587 {
588 GT->DBGFN (GT, "webcache \"%s\" not in DNS. banning", name);
589 ban_webcache (req, "Host not found in DNS");
590 return;
591 }
592 }
594 static BOOL make_request (char *host_name, char *remote_path, char *request)
595 {
596 HttpRequest *req;
597 TCPC *c;
598 char *resolve_name;
599 char *url;
601 url = stringf_dup ("http://%s/%s", host_name, STRING_NOTNULL(remote_path));
603 if (!(req = gt_http_request_new (url, request)))
604 {
605 free (url);
606 return FALSE;
607 }
609 free (url);
611 resolve_name = get_http_name (host_name);
613 gt_dns_set_errno (0);
615 if (!(c = open_http_connection (req, resolve_name)))
616 {
617 check_dns_error (resolve_name, req);
618 gt_http_request_close (req, -1);
619 return FALSE;
620 }
622 GT->DBGFN (GT, "opening connection to %s [%s]",
623 resolve_name, net_ip_str (c->host));
625 req->recv_func = handle_recv;
626 req->add_header_func = handle_add_headers;
627 req->close_req_func = handle_close_request;
628 req->redirect_func = handle_redirect;
630 gt_http_request_set_conn (req, c); /* setup references */
631 gt_http_request_set_proxy (req, gt_proxy_server); /* maybe use proxy */
632 gt_http_request_set_timeout (req, 2 * MINUTES); /* don't wait forever */
633 gt_http_request_set_max_len (req, 65536); /* don't read forever */
635 input_add (c->fd, c, INPUT_WRITE,
636 (InputCallback)gt_http_request_handle, TIMEOUT_DEF);
638 return TRUE;
639 }
641 /*****************************************************************************/
/* state shared between get_random_cache() and foreach_rand_cache() */
struct find_rand_args
{
	int     n;       /* selection counter; starts at 1 */
	time_t  now;     /* current time, used for the retry-interval check */
	char   *url;     /* allocated copy of the selected cache url, or NULL */
	char   *field;   /* allocated copy of the selected cache's value */
};
651 /* get a random cache from the webcaches dataset */
652 static void foreach_rand_cache (ds_data_t *key, ds_data_t *value,
653 struct find_rand_args *args)
654 {
655 time_t atime;
656 float range = args->n;
657 char *str;
658 char *url = key->data;
659 char *hostname, *path;
660 int ret;
662 if (!parse_web_cache_value (value->data, &atime))
663 return;
665 /* skip the cache entirely if we've retried too soon */
666 if (args->now - atime < CACHE_RETRY_TIME)
667 return;
669 /*
670 * Make sure the cache has a parseable url
671 *
672 * TODO: This is ugly, it really should be parsed into a
673 * a data structure once instead.
674 */
675 str = STRDUP (url);
676 ret = gt_http_url_parse (str, &hostname, &path);
677 free (str);
679 if (!ret)
680 {
681 GT->warn (GT, "bad webcache url \"%s\" from %s/gwebcaches",
682 key->data, gift_conf_path ("Gnutella"));
683 return;
684 }
686 /* decrease probability of selecting the next web cache */
687 args->n++;
689 /*
690 * Select this webcache with probability 1/n.
691 *
692 * Also select this cache if we haven't chosen one yet, which may be the
693 * case on if the index of the cache is > 0 when there are banned caches.
694 */
695 if (args->url == NULL ||
696 range * rand() / (RAND_MAX + 1.0) < 1.0)
697 {
698 char *keystr = key->data;
699 char *valuestr = value->data;
701 /* check if this is a bad gwebcache */
702 if (file_cache_lookup (bad_caches, url))
703 {
704 #if 1
705 GT->warn (GT, "skipping webcache %s, in bad gwebcaches", url);
706 #endif
707 /* pretend we didn't select this to ensure equal distribution */
708 args->n--;
710 return;
711 }
713 /* free the old values */
714 free (args->url);
715 free (args->field);
717 args->url = STRDUP (keystr);
718 args->field = STRDUP (valuestr);
719 }
720 }
722 static BOOL get_random_cache (time_t now, char **r_host_name,
723 char **r_remote_path)
724 {
725 int ret;
726 struct find_rand_args args;
728 args.n = 1; /* initial probability */
729 args.now = now; /* current time */
730 args.url = NULL;
731 args.field = NULL;
733 dataset_foreach (web_caches->d, DS_FOREACH(foreach_rand_cache), &args);
735 if (!args.url)
736 {
737 GT->DBGFN (GT, "couldn't find random cache");
738 return FALSE;
739 }
741 ret = gt_http_url_parse (args.url, r_host_name, r_remote_path);
743 if (!*r_host_name || !*r_remote_path)
744 {
745 free (args.url);
746 free (args.field);
747 return FALSE;
748 }
750 *r_host_name = STRDUP (*r_host_name);
751 *r_remote_path = STRDUP (*r_remote_path);
753 /* free the original buffer */
754 free (args.url);
755 free (args.field);
757 return ret;
758 }
760 static void access_gwebcaches (void)
761 {
762 int len;
763 char *host_name;
764 char *remote_path;
765 time_t now;
766 int host_requests = 0;
767 #if 0
768 int url_requests = 0;
769 #endif
770 int max_requests = 1;
771 BOOL ret;
772 BOOL need_sync;
774 /*
775 * We may get called while a check of the gwebcaches is already
776 * in progress.
777 */
778 if (checking_caches)
779 {
780 GT->DBGFN (GT, "Access already in progress");
781 return;
782 }
784 now = time (NULL);
786 len = dataset_length (web_caches->d);
788 if (max_requests > len)
789 max_requests = len;
791 need_sync = FALSE;
793 while (host_requests < max_requests)
794 {
795 if (!get_random_cache (now, &host_name, &remote_path))
796 {
797 GT->DBGFN (GT, "error looking up cache");
798 break;
799 }
801 #if 0
802 /* make a url request sometimes to keep the cache file up to date, but
803 * mostly ask for hosts */
804 if (10.0 * rand() / (RAND_MAX + 1.0) < 1.0)
805 {
806 ret = make_request (host_name, remote_path,
807 "urlfile=1&client=GIFT&version=" GT_VERSION);
808 url_requests++;
809 }
810 else
811 #endif
812 {
813 ret = make_request (host_name, remote_path,
814 "hostfile=1&client=GIFT&version=" GT_VERSION);
816 if (ret)
817 checking_caches = TRUE;
819 host_requests++;
820 }
822 if (ret)
823 {
824 GT->DBGFN (GT, "hitting web cache [total cache hits %u] "
825 "(cache: http://%s/%s)", cache_hits,
826 host_name, STRING_NOTNULL(remote_path));
828 cache_hits++;
829 need_sync = TRUE;
831 /* reset the atime for the cache */
832 insert_webcache (host_name, remote_path, now);
833 }
835 free (host_name);
836 free (remote_path);
837 }
839 /* only sync when we successfully accessed a cache */
840 if (need_sync)
841 file_cache_sync (web_caches);
842 }
844 static BOOL webcache_update (void *udata)
845 {
846 char *webcache_file;
847 int web_exists;
848 time_t now;
849 size_t nodes_len;
850 struct stat st;
852 if (GNUTELLA_LOCAL_MODE)
853 return TRUE;
855 now = time (NULL);
856 nodes_len = gt_conn_length (GT_NODE_NONE, GT_NODE_ANY);
858 /*
859 * If we've already accessed the caches successfully, we won't
860 * allow another access to go through, _unless_ the node list
861 * is small enough, in which case it could be we really do need
862 * to access the caches.
863 */
864 if (now < next_atime && nodes_len >= 20)
865 return FALSE;
867 webcache_file = STRDUP (gift_conf_path ("Gnutella/gwebcaches"));
868 web_exists = file_stat (webcache_file, &st);
870 if (!web_exists)
871 {
872 GIFT_ERROR (("gwebcaches file doesn't exist"));
873 return FALSE;
874 }
876 /*
877 * next_atime, the absolute next time we allow ourselves to contact the
878 * caches, gets set when we sucessfully access the caches, and if we
879 * manage to get some hosts from a cache we access in an exponentially
880 * decreasing interval.
881 */
882 access_gwebcaches ();
884 free (webcache_file);
885 return TRUE;
886 }
888 /*****************************************************************************/
/* Public entry point: run one webcache update pass, ignoring its result. */
void gt_web_cache_update (void)
{
	(void) webcache_update (NULL);
}
895 BOOL gt_web_cache_init (void)
896 {
897 /*
898 * Copy the gwebcaches file to from the data dir to
899 * ~/.giFT/Gnutella if it is newer or if ~/.giFT/Gnutella/gwebcaches
900 * doesn't exist.
901 */
902 gt_config_load_file ("Gnutella/gwebcaches", TRUE, FALSE);
904 web_caches = file_cache_new (gift_conf_path ("Gnutella/gwebcaches"));
905 bad_caches = file_cache_new (gift_conf_path ("Gnutella/bad_gwebcaches"));
907 if (!web_caches)
908 return FALSE;
910 return TRUE;
911 }
913 void gt_web_cache_cleanup (void)
914 {
915 file_cache_free (web_caches);
916 web_caches = NULL;
918 file_cache_free (bad_caches);
919 bad_caches = NULL;
921 cache_hits = 0;
922 next_atime = 0;
924 checking_caches = FALSE;
925 }