Mercurial > hg > index.fcgi > gift-gnutella > gift-gnutella-0.0.11-1pba
diff src/gt_web_cache.c @ 0:d39e1d0d75b6
initial add
author | paulo@hit-nxdomain.opendns.com |
---|---|
date | Sat, 20 Feb 2010 21:18:28 -0800 |
parents | |
children |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/src/gt_web_cache.c Sat Feb 20 21:18:28 2010 -0800 1.3 @@ -0,0 +1,925 @@ 1.4 +/* 1.5 + * $Id: gt_web_cache.c,v 1.65 2006/08/06 16:53:36 hexwab Exp $ 1.6 + * 1.7 + * Copyright (C) 2001-2003 giFT project (gift.sourceforge.net) 1.8 + * 1.9 + * This program is free software; you can redistribute it and/or modify it 1.10 + * under the terms of the GNU General Public License as published by the 1.11 + * Free Software Foundation; either version 2, or (at your option) any 1.12 + * later version. 1.13 + * 1.14 + * This program is distributed in the hope that it will be useful, but 1.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of 1.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 1.17 + * General Public License for more details. 1.18 + */ 1.19 + 1.20 +#include "gt_gnutella.h" 1.21 + 1.22 +#include "file_cache.h" 1.23 +#include "http_request.h" 1.24 + 1.25 +#include "gt_connect.h" 1.26 +#include "gt_node.h" 1.27 +#include "gt_node_list.h" 1.28 +#include "gt_netorg.h" 1.29 + 1.30 +#include "gt_web_cache.h" 1.31 +#include "gt_conf.h" 1.32 + 1.33 +#include "dns.h" 1.34 + 1.35 +/*****************************************************************************/ 1.36 + 1.37 +/* minimum time to wait before reconnecting to a webcache */ 1.38 +#define CACHE_RETRY_TIME (8 * EHOURS) 1.39 + 1.40 +/*****************************************************************************/ 1.41 + 1.42 +/* number of times we have hit some gwebcaches */ 1.43 +static int cache_hits; 1.44 + 1.45 +/* the absolute next time we will allow ourselves to access a cache */ 1.46 +static time_t next_atime; 1.47 + 1.48 +/* amount of time to layoff the caches once we've received some data */ 1.49 +static time_t backoff_time = 1 * EHOURS; 1.50 + 1.51 +/* holds all the caches */ 1.52 +static FileCache *web_caches; 1.53 + 1.54 +/* proxy server to contact */ 1.55 +static char *gt_proxy_server; 1.56 + 1.57 +/* webcaches that succeeded connecting, but returned errors or an 1.58 + * unparseable response */ 1.59 +static FileCache *bad_caches; 1.60 + 1.61 +/* whether we are in the process of checking the caches */ 1.62 +static BOOL checking_caches; 1.63 + 1.64 +/*****************************************************************************/ 1.65 + 1.66 +static void parse_hostfile_response (HttpRequest *http_req, char *hosts_file); 1.67 +static void parse_urlfile_response (HttpRequest *http_req, char *url_file); 1.68 + 1.69 +/*****************************************************************************/ 1.70 + 1.71 +/* parse the extended data in the webcaches file, now its just mtime */ 1.72 +static BOOL parse_web_cache_value (char *value, time_t *r_atime) 1.73 +{ 1.74 + time_t atime; 1.75 + 1.76 + if ((atime = ATOUL (value)) == (unsigned long) -1) 1.77 + atime = 0; 1.78 + 1.79 + if (r_atime) 1.80 + *r_atime = atime; 1.81 + 1.82 + return TRUE; 1.83 +} 1.84 + 1.85 +/*****************************************************************************/ 1.86 + 1.87 +static char *new_webcache_url (const char *host, const char *path) 1.88 +{ 1.89 + return stringf_dup ("http://%s/%s", host, STRING_NOTNULL(path)); 1.90 +} 1.91 + 1.92 +static void ban_webcache (HttpRequest *req, const char *why) 1.93 +{ 1.94 + char *url; 1.95 + 1.96 + url = new_webcache_url (req->host, req->path); 1.97 + GT->dbg (GT, "banning webcache %s", url); 1.98 + 1.99 + file_cache_insert (bad_caches, url, why); 1.100 + file_cache_sync (bad_caches); 1.101 + 1.102 + free (url); 1.103 +} 1.104 + 1.105 +static void insert_webcache (const char *host_name, const char *remote_path, 1.106 + time_t atime) 1.107 +{ 1.108 + char *url; 1.109 + char *field; 1.110 + 1.111 + url = new_webcache_url (host_name, remote_path); 1.112 + field = stringf_dup ("%lu", atime); 1.113 + 1.114 + file_cache_insert (web_caches, url, field); 1.115 + 1.116 + free (url); 1.117 + free (field); 1.118 +} 1.119 + 1.120 +/*****************************************************************************/ 1.121 + 1.122 +static void handle_close_request (HttpRequest *req, int error_code) 1.123 +{ 1.124 + String *s; 1.125 + 1.126 + if (error_code < 0 || error_code < 200 || error_code >= 300) 1.127 + { 1.128 + if (error_code == -1) 1.129 + { 1.130 + /* the error was our fault, out of mem, etc. dont do anything */ 1.131 + GT->DBGFN (GT, "connect to server %s failed for some reason", 1.132 + req->host); 1.133 + } 1.134 + else 1.135 + { 1.136 + char err[32]; 1.137 + 1.138 + snprintf (err, sizeof(err), "Received error %d", error_code); 1.139 + 1.140 + /* 1.141 + * Not found, internal server error, or too many redirects: ban 1.142 + * the server's URL 1.143 + */ 1.144 + GT->DBGFN (GT, "server %s returned error %i", req->host, 1.145 + error_code); 1.146 + ban_webcache (req, err); 1.147 + } 1.148 + } 1.149 + 1.150 + /* TODO: this assumes this is the one hostfile request flying around, 1.151 + * and not a urlfile request, which probably needs to be handled 1.152 + * separately */ 1.153 + checking_caches = FALSE; 1.154 + 1.155 + if ((s = req->data)) 1.156 + string_free (s); 1.157 +} 1.158 + 1.159 +static void parse_hostfile_response (HttpRequest *http_req, char *host_file) 1.160 +{ 1.161 + int hosts = 0; 1.162 + GtNode *node; 1.163 + time_t now; 1.164 + 1.165 + if (!host_file) 1.166 + { 1.167 + GT->DBGFN (GT, "empty host file from %s", http_req->host); 1.168 + return; 1.169 + } 1.170 + 1.171 + GT->DBGFN (GT, "hostfile from server = %s", host_file); 1.172 + 1.173 + now = time (NULL); 1.174 + 1.175 + /* 1.176 + * If the response start with "ERROR: " (or pseudo-html '<' char), ban the 1.177 + * webcache. 1.178 + */ 1.179 + if (!strncasecmp (host_file, "ERROR", sizeof ("ERROR") - 1) || 1.180 + host_file[0] == '<') 1.181 + { 1.182 + ban_webcache (http_req, "Malformed response content"); 1.183 + return; 1.184 + } 1.185 + 1.186 + while (host_file && *host_file) 1.187 + { 1.188 + char *host; 1.189 + in_addr_t ip; 1.190 + in_port_t port; 1.191 + 1.192 + host = string_sep_set (&host_file, "\r\n"); 1.193 + 1.194 + ip = net_ip (string_sep (&host, ":")); 1.195 + port = ATOI (host); 1.196 + 1.197 + if (!port || !ip || ip == INADDR_NONE) 1.198 + continue; 1.199 + 1.200 + GT->DBGFN (GT, "registering %s:%hu (from cache %s)", net_ip_str (ip), 1.201 + port, http_req->host); 1.202 + 1.203 + /* register the hosts as ultrapeers */ 1.204 + node = gt_node_register (ip, port, GT_NODE_ULTRA); 1.205 + hosts++; 1.206 + 1.207 + if (!node) 1.208 + continue; 1.209 + 1.210 + /* set the vitality on this node to preserve it across restarts */ 1.211 + node->vitality = now; 1.212 + 1.213 + /* might be connected already */ 1.214 + if (node->state != GT_NODE_DISCONNECTED) 1.215 + continue; 1.216 + 1.217 + /* try to connect to the first 5 */ 1.218 + if (hosts <= 5 && gt_conn_need_connections (GT_NODE_ULTRA)) 1.219 + gt_connect (node); 1.220 + 1.221 + /* don't allow the cache to register an infinite number of hosts */ 1.222 + if (hosts >= 50) 1.223 + break; 1.224 + } 1.225 + 1.226 + /* save the nodes we added to disk so we dont hit the caches again */ 1.227 + gt_node_list_save (); 1.228 + 1.229 + /* 1.230 + * Do an exponential backoff from the caches. If we were online and 1.231 + * able to receive data, we should be getting node information 1.232 + * some other way now. 1.233 + */ 1.234 + if (hosts >= 5) 1.235 + { 1.236 + next_atime = now + backoff_time; 1.237 + backoff_time *= 2; 1.238 + } 1.239 +} 1.240 + 1.241 +static void parse_urlfile_response (HttpRequest *http_req, char *url_file) 1.242 +{ 1.243 + int caches = 0; 1.244 + 1.245 + if (!url_file) 1.246 + { 1.247 + GT->DBGFN (GT, "empty url file from %s", http_req->host); 1.248 + return; 1.249 + } 1.250 + 1.251 + GT->DBGFN (GT, "urlfile from server = %s", url_file); 1.252 + 1.253 + while (url_file && *url_file) 1.254 + { 1.255 + char *url; 1.256 + char *host_name; 1.257 + char *remote_path; 1.258 + 1.259 + url = string_sep_set (&url_file, "\r\n"); 1.260 + 1.261 + /* skip past http:// */ 1.262 + string_sep (&url, "http://"); 1.263 + 1.264 + host_name = string_sep (&url, "/"); 1.265 + remote_path = url; 1.266 + 1.267 + /* NOTE: remote_path is possibly empty */ 1.268 + if (!host_name) 1.269 + continue; 1.270 + 1.271 + url = stringf ("http://%s/%s", host_name, STRING_NOTNULL(remote_path)); 1.272 + 1.273 + /* if the webcache is already in our db, skip it */ 1.274 + if (file_cache_lookup (web_caches, url)) 1.275 + continue; 1.276 + 1.277 + /* 1.278 + * Only allow caches to register two more caches: this 1.279 + * small number helps to avoid our list of caches getting 1.280 + * polluted. 1.281 + */ 1.282 + if (++caches > 2) 1.283 + break; 1.284 + 1.285 + /* format is: <url> <last time visited> */ 1.286 + file_cache_insert (web_caches, url, "0"); 1.287 + } 1.288 + 1.289 + /* sync the pending web caches to disk */ 1.290 + file_cache_sync (web_caches); 1.291 +} 1.292 + 1.293 +static void end_request (HttpRequest *req, char *data) 1.294 +{ 1.295 + char *str = req->request; 1.296 + 1.297 + if (str && !strncmp (str, "hostfile", strlen ("hostfile"))) 1.298 + parse_hostfile_response (req, data); 1.299 + else if (str && !strncmp (str, "urlfile", strlen ("urlfile"))) 1.300 + parse_urlfile_response (req, data); 1.301 + else 1.302 + abort (); 1.303 +} 1.304 + 1.305 +/*****************************************************************************/ 1.306 + 1.307 +/* 1.308 + * Return TRUE if newname is in the same domain as oldname. For example, 1.309 + * "new.gwc.example.com", "example.com", and "cache.example.com" are all 1.310 + * considered in the same domain as "www.example.com". 1.311 + * 1.312 + * This is called on redirects, to make sure the cache can't redirect to an 1.313 + * innocent site as part of a DDoS attack. 1.314 + */ 1.315 +static BOOL in_same_domain (const char *oldname, const char *newname) 1.316 +{ 1.317 + return FALSE; 1.318 +#if 0 1.319 + const char *p; 1.320 + const char *largest = NULL; 1.321 + int periods = 0; 1.322 + 1.323 + p = newname; 1.324 + 1.325 + /* get the largest common substring */ 1.326 + while (p != NULL) 1.327 + { 1.328 + if ((largest = strstr (oldname, p))) 1.329 + break; 1.330 + 1.331 + /* advance to next domain part */ 1.332 + p = strchr (p + 1, '.'); 1.333 + } 1.334 + 1.335 + if (!largest) 1.336 + return FALSE; 1.337 + 1.338 + /* 1.339 + * Make sure the substring matches completely to the end. This will 1.340 + * actually fail when it shouldn't if one name includes the '.' toplevel 1.341 + * domain and one doesn't. Oh well. 1.342 + */ 1.343 + if (strcmp (largest, p) != 0) 1.344 + return FALSE; 1.345 + 1.346 + /* 1.347 + * Count the number of periods to find the number of subdomains in the 1.348 + * largest common substring. 1.349 + */ 1.350 + for (p = largest; *p != 0; p++) 1.351 + { 1.352 + if (*p == '.') 1.353 + periods++; 1.354 + } 1.355 + 1.356 + /* 1.357 + * If the last character is the root '.', subtract one, since we are 1.358 + * looking for the number of common subdomains, and the root is shared by 1.359 + * all names. 1.360 + */ 1.361 + if (largest[strlen (largest) - 1] == '.') 1.362 + periods--; 1.363 + 1.364 + /* 1.365 + * If there are two periods, at least two toplevel domains match. 1.366 + */ 1.367 + if (periods >= 2) 1.368 + return TRUE; 1.369 + 1.370 + /* 1.371 + * If there is only one period shared, the names MAY be in the same 1.372 + * domain: one of the names has to be completely contained within the 1.373 + * other, such as the case of "foo.example.com" and "example.com". 1.374 + */ 1.375 + if (periods == 1 && 1.376 + (strcmp (largest, oldname) == 0 || strcmp (largest, newname) == 0)) 1.377 + { 1.378 + return TRUE; 1.379 + } 1.380 + 1.381 + /* not in same domain */ 1.382 + return FALSE; 1.383 +#endif 1.384 +} 1.385 + 1.386 +/* 1.387 + * Called to when the webcache sends a 300-level response with a provided 1.388 + * Location: header. Have to make sure the domain the cache directs us 1.389 + * to is the same. 1.390 + */ 1.391 +static BOOL handle_redirect (HttpRequest *req, const char *new_host, 1.392 + const char *new_path) 1.393 +{ 1.394 + assert (new_host != NULL); 1.395 + 1.396 + if (in_same_domain (req->host, new_host) == FALSE) 1.397 + return FALSE; 1.398 + 1.399 + /* might want to do something else if the ban list later becomes per host 1.400 + * rather than per URL */ 1.401 + ban_webcache (req, "Redirected"); 1.402 + 1.403 + GT->DBGFN (GT, "Redirecting to new webcache %s/%s", new_host, new_path); 1.404 + 1.405 + insert_webcache (new_host, new_path, time (NULL)); 1.406 + file_cache_sync (web_caches); 1.407 + 1.408 + return TRUE; 1.409 +} 1.410 + 1.411 +/*****************************************************************************/ 1.412 + 1.413 +static BOOL handle_recv (HttpRequest *req, char *data, size_t len) 1.414 +{ 1.415 + String *s; 1.416 + 1.417 + /* EOF */ 1.418 + if (!data) 1.419 + { 1.420 + char *str = NULL; 1.421 + 1.422 + if ((s = req->data)) 1.423 + str = s->str; 1.424 + 1.425 + GT->DBGFN (GT, "read %s from server %s", str, req->host); 1.426 + end_request (req, str); 1.427 + 1.428 + /* clear data link */ 1.429 + req->data = NULL; 1.430 + 1.431 + return TRUE; 1.432 + } 1.433 + 1.434 + if (!len) 1.435 + return TRUE; 1.436 + 1.437 + GT->DBGFN (GT, "server sent us: %s", data); 1.438 + 1.439 + if (!(s = req->data) && !(s = req->data = string_new (NULL, 0, 0, TRUE))) 1.440 + return FALSE; 1.441 + 1.442 + if (string_append (s, data) != len) 1.443 + { 1.444 + GT->DBGFN (GT, "string append failed"); 1.445 + return FALSE; 1.446 + } 1.447 + 1.448 + return TRUE; 1.449 +} 1.450 + 1.451 +/*****************************************************************************/ 1.452 + 1.453 +static BOOL handle_add_headers (HttpRequest *req, Dataset **headers) 1.454 +{ 1.455 + /* don't let intermediaries cache our request, I think */ 1.456 + dataset_insertstr (headers, "Cache-Control", "no-cache"); 1.457 + 1.458 + return TRUE; 1.459 +} 1.460 + 1.461 +/*****************************************************************************/ 1.462 + 1.463 +static BOOL parse_host_and_port (char **r_host, in_port_t *r_port) 1.464 +{ 1.465 + char *str; 1.466 + char *host; 1.467 + long port; 1.468 + 1.469 + str = *r_host; 1.470 + 1.471 + if (r_port) 1.472 + *r_port = 80; 1.473 + 1.474 + /* skip leading 'http://' if found */ 1.475 + if (strstr (str, "http://")) 1.476 + str += strlen ("http://"); 1.477 + 1.478 + host = string_sep (&str, ":"); 1.479 + 1.480 + if (!host) 1.481 + return FALSE; 1.482 + 1.483 + *r_host = host; 1.484 + 1.485 + if (str && !string_isempty (str)) 1.486 + { 1.487 + port = gift_strtol (str); 1.488 + 1.489 + /* make sure port is valid */ 1.490 + if (port <= 0 || port >= 65536) 1.491 + return FALSE; 1.492 + 1.493 + *r_port = port; 1.494 + } 1.495 + 1.496 + return TRUE; 1.497 +} 1.498 + 1.499 +static TCPC *open_http_connection (HttpRequest *req, const char *http_name) 1.500 +{ 1.501 + in_addr_t ip; 1.502 + in_port_t port; 1.503 + char *str; 1.504 + char *name; 1.505 + TCPC *c; 1.506 + struct hostent *host; 1.507 + 1.508 + if (!http_name) 1.509 + return NULL; 1.510 + 1.511 + if (!(str = STRDUP (http_name))) 1.512 + return NULL; 1.513 + 1.514 + name = str; 1.515 + 1.516 + if (!parse_host_and_port (&name, &port)) 1.517 + { 1.518 + GT->DBGFN (GT, "error parsing hostname \"%s\"", str); 1.519 + free (str); 1.520 + return NULL; 1.521 + } 1.522 + 1.523 + if (!(host = gt_dns_lookup (name))) 1.524 + { 1.525 + free (str); 1.526 + return NULL; 1.527 + } 1.528 + 1.529 + /* ip is in network-order already */ 1.530 + memcpy (&ip, host->h_addr, MIN (host->h_length, sizeof (ip))); 1.531 + 1.532 + if (net_match_host (ip, "LOCAL")) 1.533 + { 1.534 + free (str); 1.535 + ban_webcache (req, "Resolved to local IP"); 1.536 + return NULL; 1.537 + } 1.538 + 1.539 + c = tcp_open (ip, port, FALSE); 1.540 + if (!c) 1.541 + { 1.542 + GT->DBGFN (GT, "couldn't open connection to %s [%s]: %s", 1.543 + http_name, net_ip_str (ip), GIFT_NETERROR()); 1.544 + } 1.545 + 1.546 + free (str); 1.547 + return c; 1.548 +} 1.549 + 1.550 +/* return the name we have to lookup */ 1.551 +static char *get_http_name (char *name) 1.552 +{ 1.553 + char *proxy; 1.554 + char *host; 1.555 + 1.556 + host = name; 1.557 + proxy = HTTP_PROXY; 1.558 + 1.559 + string_trim (proxy); 1.560 + 1.561 + if (proxy && !string_isempty (proxy)) 1.562 + { 1.563 + /* connect to the proxy instead */ 1.564 + if (STRCMP (proxy, gt_proxy_server) != 0) 1.565 + { 1.566 + GT->DBGFN (GT, "using proxy server %s", proxy); 1.567 + free (gt_proxy_server); 1.568 + gt_proxy_server = STRDUP (proxy); 1.569 + } 1.570 + 1.571 + host = proxy; 1.572 + } 1.573 + 1.574 + return host; 1.575 +} 1.576 + 1.577 +static void check_dns_error (const char *name, HttpRequest *req) 1.578 +{ 1.579 + int error; 1.580 + 1.581 + error = gt_dns_get_errno (); 1.582 + 1.583 + if (!error) 1.584 + return; 1.585 + 1.586 + GT->DBGFN (GT, "lookup failed on \"%s\": %s", name, gt_dns_strerror(error)); 1.587 + 1.588 + /* ban the host, but only if not using a proxy server */ 1.589 + if (error == HOST_NOT_FOUND && gt_proxy_server == NULL) 1.590 + { 1.591 + GT->DBGFN (GT, "webcache \"%s\" not in DNS. banning", name); 1.592 + ban_webcache (req, "Host not found in DNS"); 1.593 + return; 1.594 + } 1.595 +} 1.596 + 1.597 +static BOOL make_request (char *host_name, char *remote_path, char *request) 1.598 +{ 1.599 + HttpRequest *req; 1.600 + TCPC *c; 1.601 + char *resolve_name; 1.602 + char *url; 1.603 + 1.604 + url = stringf_dup ("http://%s/%s", host_name, STRING_NOTNULL(remote_path)); 1.605 + 1.606 + if (!(req = gt_http_request_new (url, request))) 1.607 + { 1.608 + free (url); 1.609 + return FALSE; 1.610 + } 1.611 + 1.612 + free (url); 1.613 + 1.614 + resolve_name = get_http_name (host_name); 1.615 + 1.616 + gt_dns_set_errno (0); 1.617 + 1.618 + if (!(c = open_http_connection (req, resolve_name))) 1.619 + { 1.620 + check_dns_error (resolve_name, req); 1.621 + gt_http_request_close (req, -1); 1.622 + return FALSE; 1.623 + } 1.624 + 1.625 + GT->DBGFN (GT, "opening connection to %s [%s]", 1.626 + resolve_name, net_ip_str (c->host)); 1.627 + 1.628 + req->recv_func = handle_recv; 1.629 + req->add_header_func = handle_add_headers; 1.630 + req->close_req_func = handle_close_request; 1.631 + req->redirect_func = handle_redirect; 1.632 + 1.633 + gt_http_request_set_conn (req, c); /* setup references */ 1.634 + gt_http_request_set_proxy (req, gt_proxy_server); /* maybe use proxy */ 1.635 + gt_http_request_set_timeout (req, 2 * MINUTES); /* don't wait forever */ 1.636 + gt_http_request_set_max_len (req, 65536); /* don't read forever */ 1.637 + 1.638 + input_add (c->fd, c, INPUT_WRITE, 1.639 + (InputCallback)gt_http_request_handle, TIMEOUT_DEF); 1.640 + 1.641 + return TRUE; 1.642 +} 1.643 + 1.644 +/*****************************************************************************/ 1.645 + 1.646 +struct find_rand_args 1.647 +{ 1.648 + int n; 1.649 + time_t now; 1.650 + char *url; 1.651 + char *field; 1.652 +}; 1.653 + 1.654 +/* get a random cache from the webcaches dataset */ 1.655 +static void foreach_rand_cache (ds_data_t *key, ds_data_t *value, 1.656 + struct find_rand_args *args) 1.657 +{ 1.658 + time_t atime; 1.659 + float range = args->n; 1.660 + char *str; 1.661 + char *url = key->data; 1.662 + char *hostname, *path; 1.663 + int ret; 1.664 + 1.665 + if (!parse_web_cache_value (value->data, &atime)) 1.666 + return; 1.667 + 1.668 + /* skip the cache entirely if we've retried too soon */ 1.669 + if (args->now - atime < CACHE_RETRY_TIME) 1.670 + return; 1.671 + 1.672 + /* 1.673 + * Make sure the cache has a parseable url 1.674 + * 1.675 + * TODO: This is ugly, it really should be parsed into a 1.676 + * a data structure once instead. 1.677 + */ 1.678 + str = STRDUP (url); 1.679 + ret = gt_http_url_parse (str, &hostname, &path); 1.680 + free (str); 1.681 + 1.682 + if (!ret) 1.683 + { 1.684 + GT->warn (GT, "bad webcache url \"%s\" from %s/gwebcaches", 1.685 + key->data, gift_conf_path ("Gnutella")); 1.686 + return; 1.687 + } 1.688 + 1.689 + /* decrease probability of selecting the next web cache */ 1.690 + args->n++; 1.691 + 1.692 + /* 1.693 + * Select this webcache with probability 1/n. 1.694 + * 1.695 + * Also select this cache if we haven't chosen one yet, which may be the 1.696 + * case on if the index of the cache is > 0 when there are banned caches. 1.697 + */ 1.698 + if (args->url == NULL || 1.699 + range * rand() / (RAND_MAX + 1.0) < 1.0) 1.700 + { 1.701 + char *keystr = key->data; 1.702 + char *valuestr = value->data; 1.703 + 1.704 + /* check if this is a bad gwebcache */ 1.705 + if (file_cache_lookup (bad_caches, url)) 1.706 + { 1.707 +#if 1 1.708 + GT->warn (GT, "skipping webcache %s, in bad gwebcaches", url); 1.709 +#endif 1.710 + /* pretend we didn't select this to ensure equal distribution */ 1.711 + args->n--; 1.712 + 1.713 + return; 1.714 + } 1.715 + 1.716 + /* free the old values */ 1.717 + free (args->url); 1.718 + free (args->field); 1.719 + 1.720 + args->url = STRDUP (keystr); 1.721 + args->field = STRDUP (valuestr); 1.722 + } 1.723 +} 1.724 + 1.725 +static BOOL get_random_cache (time_t now, char **r_host_name, 1.726 + char **r_remote_path) 1.727 +{ 1.728 + int ret; 1.729 + struct find_rand_args args; 1.730 + 1.731 + args.n = 1; /* initial probability */ 1.732 + args.now = now; /* current time */ 1.733 + args.url = NULL; 1.734 + args.field = NULL; 1.735 + 1.736 + dataset_foreach (web_caches->d, DS_FOREACH(foreach_rand_cache), &args); 1.737 + 1.738 + if (!args.url) 1.739 + { 1.740 + GT->DBGFN (GT, "couldn't find random cache"); 1.741 + return FALSE; 1.742 + } 1.743 + 1.744 + ret = gt_http_url_parse (args.url, r_host_name, r_remote_path); 1.745 + 1.746 + if (!*r_host_name || !*r_remote_path) 1.747 + { 1.748 + free (args.url); 1.749 + free (args.field); 1.750 + return FALSE; 1.751 + } 1.752 + 1.753 + *r_host_name = STRDUP (*r_host_name); 1.754 + *r_remote_path = STRDUP (*r_remote_path); 1.755 + 1.756 + /* free the original buffer */ 1.757 + free (args.url); 1.758 + free (args.field); 1.759 + 1.760 + return ret; 1.761 +} 1.762 + 1.763 +static void access_gwebcaches (void) 1.764 +{ 1.765 + int len; 1.766 + char *host_name; 1.767 + char *remote_path; 1.768 + time_t now; 1.769 + int host_requests = 0; 1.770 +#if 0 1.771 + int url_requests = 0; 1.772 +#endif 1.773 + int max_requests = 1; 1.774 + BOOL ret; 1.775 + BOOL need_sync; 1.776 + 1.777 + /* 1.778 + * We may get called while a check of the gwebcaches is already 1.779 + * in progress. 1.780 + */ 1.781 + if (checking_caches) 1.782 + { 1.783 + GT->DBGFN (GT, "Access already in progress"); 1.784 + return; 1.785 + } 1.786 + 1.787 + now = time (NULL); 1.788 + 1.789 + len = dataset_length (web_caches->d); 1.790 + 1.791 + if (max_requests > len) 1.792 + max_requests = len; 1.793 + 1.794 + need_sync = FALSE; 1.795 + 1.796 + while (host_requests < max_requests) 1.797 + { 1.798 + if (!get_random_cache (now, &host_name, &remote_path)) 1.799 + { 1.800 + GT->DBGFN (GT, "error looking up cache"); 1.801 + break; 1.802 + } 1.803 + 1.804 +#if 0 1.805 + /* make a url request sometimes to keep the cache file up to date, but 1.806 + * mostly ask for hosts */ 1.807 + if (10.0 * rand() / (RAND_MAX + 1.0) < 1.0) 1.808 + { 1.809 + ret = make_request (host_name, remote_path, 1.810 + "urlfile=1&client=GIFT&version=" GT_VERSION); 1.811 + url_requests++; 1.812 + } 1.813 + else 1.814 +#endif 1.815 + { 1.816 + ret = make_request (host_name, remote_path, 1.817 + "hostfile=1&client=GIFT&version=" GT_VERSION); 1.818 + 1.819 + if (ret) 1.820 + checking_caches = TRUE; 1.821 + 1.822 + host_requests++; 1.823 + } 1.824 + 1.825 + if (ret) 1.826 + { 1.827 + GT->DBGFN (GT, "hitting web cache [total cache hits %u] " 1.828 + "(cache: http://%s/%s)", cache_hits, 1.829 + host_name, STRING_NOTNULL(remote_path)); 1.830 + 1.831 + cache_hits++; 1.832 + need_sync = TRUE; 1.833 + 1.834 + /* reset the atime for the cache */ 1.835 + insert_webcache (host_name, remote_path, now); 1.836 + } 1.837 + 1.838 + free (host_name); 1.839 + free (remote_path); 1.840 + } 1.841 + 1.842 + /* only sync when we successfully accessed a cache */ 1.843 + if (need_sync) 1.844 + file_cache_sync (web_caches); 1.845 +} 1.846 + 1.847 +static BOOL webcache_update (void *udata) 1.848 +{ 1.849 + char *webcache_file; 1.850 + int web_exists; 1.851 + time_t now; 1.852 + size_t nodes_len; 1.853 + struct stat st; 1.854 + 1.855 + if (GNUTELLA_LOCAL_MODE) 1.856 + return TRUE; 1.857 + 1.858 + now = time (NULL); 1.859 + nodes_len = gt_conn_length (GT_NODE_NONE, GT_NODE_ANY); 1.860 + 1.861 + /* 1.862 + * If we've already accessed the caches successfully, we won't 1.863 + * allow another access to go through, _unless_ the node list 1.864 + * is small enough, in which case it could be we really do need 1.865 + * to access the caches. 1.866 + */ 1.867 + if (now < next_atime && nodes_len >= 20) 1.868 + return FALSE; 1.869 + 1.870 + webcache_file = STRDUP (gift_conf_path ("Gnutella/gwebcaches")); 1.871 + web_exists = file_stat (webcache_file, &st); 1.872 + 1.873 + if (!web_exists) 1.874 + { 1.875 + GIFT_ERROR (("gwebcaches file doesn't exist")); 1.876 + return FALSE; 1.877 + } 1.878 + 1.879 + /* 1.880 + * next_atime, the absolute next time we allow ourselves to contact the 1.881 + * caches, gets set when we sucessfully access the caches, and if we 1.882 + * manage to get some hosts from a cache we access in an exponentially 1.883 + * decreasing interval. 1.884 + */ 1.885 + access_gwebcaches (); 1.886 + 1.887 + free (webcache_file); 1.888 + return TRUE; 1.889 +} 1.890 + 1.891 +/*****************************************************************************/ 1.892 + 1.893 +void gt_web_cache_update (void) 1.894 +{ 1.895 + webcache_update (NULL); 1.896 +} 1.897 + 1.898 +BOOL gt_web_cache_init (void) 1.899 +{ 1.900 + /* 1.901 + * Copy the gwebcaches file to from the data dir to 1.902 + * ~/.giFT/Gnutella if it is newer or if ~/.giFT/Gnutella/gwebcaches 1.903 + * doesn't exist. 1.904 + */ 1.905 + gt_config_load_file ("Gnutella/gwebcaches", TRUE, FALSE); 1.906 + 1.907 + web_caches = file_cache_new (gift_conf_path ("Gnutella/gwebcaches")); 1.908 + bad_caches = file_cache_new (gift_conf_path ("Gnutella/bad_gwebcaches")); 1.909 + 1.910 + if (!web_caches) 1.911 + return FALSE; 1.912 + 1.913 + return TRUE; 1.914 +} 1.915 + 1.916 +void gt_web_cache_cleanup (void) 1.917 +{ 1.918 + file_cache_free (web_caches); 1.919 + web_caches = NULL; 1.920 + 1.921 + file_cache_free (bad_caches); 1.922 + bad_caches = NULL; 1.923 + 1.924 + cache_hits = 0; 1.925 + next_atime = 0; 1.926 + 1.927 + checking_caches = FALSE; 1.928 +}