paulo@0: /* paulo@0: * $Id: gt_web_cache.c,v 1.65 2006/08/06 16:53:36 hexwab Exp $ paulo@0: * paulo@0: * Copyright (C) 2001-2003 giFT project (gift.sourceforge.net) paulo@0: * paulo@0: * This program is free software; you can redistribute it and/or modify it paulo@0: * under the terms of the GNU General Public License as published by the paulo@0: * Free Software Foundation; either version 2, or (at your option) any paulo@0: * later version. paulo@0: * paulo@0: * This program is distributed in the hope that it will be useful, but paulo@0: * WITHOUT ANY WARRANTY; without even the implied warranty of paulo@0: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU paulo@0: * General Public License for more details. paulo@0: */ paulo@0: paulo@0: #include "gt_gnutella.h" paulo@0: paulo@0: #include "file_cache.h" paulo@0: #include "http_request.h" paulo@0: paulo@0: #include "gt_connect.h" paulo@0: #include "gt_node.h" paulo@0: #include "gt_node_list.h" paulo@0: #include "gt_netorg.h" paulo@0: paulo@0: #include "gt_web_cache.h" paulo@0: #include "gt_conf.h" paulo@0: paulo@0: #include "dns.h" paulo@0: paulo@0: /*****************************************************************************/ paulo@0: paulo@0: /* minimum time to wait before reconnecting to a webcache */ paulo@0: #define CACHE_RETRY_TIME (8 * EHOURS) paulo@0: paulo@0: /*****************************************************************************/ paulo@0: paulo@0: /* number of times we have hit some gwebcaches */ paulo@0: static int cache_hits; paulo@0: paulo@0: /* the absolute next time we will allow ourselves to access a cache */ paulo@0: static time_t next_atime; paulo@0: paulo@0: /* amount of time to layoff the caches once we've received some data */ paulo@0: static time_t backoff_time = 1 * EHOURS; paulo@0: paulo@0: /* holds all the caches */ paulo@0: static FileCache *web_caches; paulo@0: paulo@0: /* proxy server to contact */ paulo@0: static char *gt_proxy_server; paulo@0: paulo@0: /* webcaches that succeeded connecting, but returned errors or an paulo@0: * unparseable response */ paulo@0: static FileCache *bad_caches; paulo@0: paulo@0: /* whether we are in the process of checking the caches */ paulo@0: static BOOL checking_caches; paulo@0: paulo@0: /*****************************************************************************/ paulo@0: paulo@0: static void parse_hostfile_response (HttpRequest *http_req, char *hosts_file); paulo@0: static void parse_urlfile_response (HttpRequest *http_req, char *url_file); paulo@0: paulo@0: /*****************************************************************************/ paulo@0: paulo@0: /* parse the extended data in the webcaches file, now its just mtime */ paulo@0: static BOOL parse_web_cache_value (char *value, time_t *r_atime) paulo@0: { paulo@0: time_t atime; paulo@0: paulo@0: if ((atime = ATOUL (value)) == (unsigned long) -1) paulo@0: atime = 0; paulo@0: paulo@0: if (r_atime) paulo@0: *r_atime = atime; paulo@0: paulo@0: return TRUE; paulo@0: } paulo@0: paulo@0: /*****************************************************************************/ paulo@0: paulo@0: static char *new_webcache_url (const char *host, const char *path) paulo@0: { paulo@0: return stringf_dup ("http://%s/%s", host, STRING_NOTNULL(path)); paulo@0: } paulo@0: paulo@0: static void ban_webcache (HttpRequest *req, const char *why) paulo@0: { paulo@0: char *url; paulo@0: paulo@0: url = new_webcache_url (req->host, req->path); paulo@0: GT->dbg (GT, "banning webcache %s", url); paulo@0: paulo@0: file_cache_insert (bad_caches, url, why); paulo@0: file_cache_sync (bad_caches); paulo@0: paulo@0: free (url); paulo@0: } paulo@0: paulo@0: static void insert_webcache (const char *host_name, const char *remote_path, paulo@0: time_t atime) paulo@0: { paulo@0: char *url; paulo@0: char *field; paulo@0: paulo@0: url = new_webcache_url (host_name, remote_path); paulo@0: field = stringf_dup ("%lu", atime); paulo@0: paulo@0: file_cache_insert (web_caches, url, field); paulo@0: paulo@0: free (url); paulo@0: free (field); paulo@0: } paulo@0: paulo@0: /*****************************************************************************/ paulo@0: paulo@0: static void handle_close_request (HttpRequest *req, int error_code) paulo@0: { paulo@0: String *s; paulo@0: paulo@0: if (error_code < 0 || error_code < 200 || error_code >= 300) paulo@0: { paulo@0: if (error_code == -1) paulo@0: { paulo@0: /* the error was our fault, out of mem, etc. dont do anything */ paulo@0: GT->DBGFN (GT, "connect to server %s failed for some reason", paulo@0: req->host); paulo@0: } paulo@0: else paulo@0: { paulo@0: char err[32]; paulo@0: paulo@0: snprintf (err, sizeof(err), "Received error %d", error_code); paulo@0: paulo@0: /* paulo@0: * Not found, internal server error, or too many redirects: ban paulo@0: * the server's URL paulo@0: */ paulo@0: GT->DBGFN (GT, "server %s returned error %i", req->host, paulo@0: error_code); paulo@0: ban_webcache (req, err); paulo@0: } paulo@0: } paulo@0: paulo@0: /* TODO: this assumes this is the one hostfile request flying around, paulo@0: * and not a urlfile request, which probably needs to be handled paulo@0: * separately */ paulo@0: checking_caches = FALSE; paulo@0: paulo@0: if ((s = req->data)) paulo@0: string_free (s); paulo@0: } paulo@0: paulo@0: static void parse_hostfile_response (HttpRequest *http_req, char *host_file) paulo@0: { paulo@0: int hosts = 0; paulo@0: GtNode *node; paulo@0: time_t now; paulo@0: paulo@0: if (!host_file) paulo@0: { paulo@0: GT->DBGFN (GT, "empty host file from %s", http_req->host); paulo@0: return; paulo@0: } paulo@0: paulo@0: GT->DBGFN (GT, "hostfile from server = %s", host_file); paulo@0: paulo@0: now = time (NULL); paulo@0: paulo@0: /* paulo@0: * If the response start with "ERROR: " (or pseudo-html '<' char), ban the paulo@0: * webcache. paulo@0: */ paulo@0: if (!strncasecmp (host_file, "ERROR", sizeof ("ERROR") - 1) || paulo@0: host_file[0] == '<') paulo@0: { paulo@0: ban_webcache (http_req, "Malformed response content"); paulo@0: return; paulo@0: } paulo@0: paulo@0: while (host_file && *host_file) paulo@0: { paulo@0: char *host; paulo@0: in_addr_t ip; paulo@0: in_port_t port; paulo@0: paulo@0: host = string_sep_set (&host_file, "\r\n"); paulo@0: paulo@0: ip = net_ip (string_sep (&host, ":")); paulo@0: port = ATOI (host); paulo@0: paulo@0: if (!port || !ip || ip == INADDR_NONE) paulo@0: continue; paulo@0: paulo@0: GT->DBGFN (GT, "registering %s:%hu (from cache %s)", net_ip_str (ip), paulo@0: port, http_req->host); paulo@0: paulo@0: /* register the hosts as ultrapeers */ paulo@0: node = gt_node_register (ip, port, GT_NODE_ULTRA); paulo@0: hosts++; paulo@0: paulo@0: if (!node) paulo@0: continue; paulo@0: paulo@0: /* set the vitality on this node to preserve it across restarts */ paulo@0: node->vitality = now; paulo@0: paulo@0: /* might be connected already */ paulo@0: if (node->state != GT_NODE_DISCONNECTED) paulo@0: continue; paulo@0: paulo@0: /* try to connect to the first 5 */ paulo@0: if (hosts <= 5 && gt_conn_need_connections (GT_NODE_ULTRA)) paulo@0: gt_connect (node); paulo@0: paulo@0: /* don't allow the cache to register an infinite number of hosts */ paulo@0: if (hosts >= 50) paulo@0: break; paulo@0: } paulo@0: paulo@0: /* save the nodes we added to disk so we dont hit the caches again */ paulo@0: gt_node_list_save (); paulo@0: paulo@0: /* paulo@0: * Do an exponential backoff from the caches. If we were online and paulo@0: * able to receive data, we should be getting node information paulo@0: * some other way now. paulo@0: */ paulo@0: if (hosts >= 5) paulo@0: { paulo@0: next_atime = now + backoff_time; paulo@0: backoff_time *= 2; paulo@0: } paulo@0: } paulo@0: paulo@0: static void parse_urlfile_response (HttpRequest *http_req, char *url_file) paulo@0: { paulo@0: int caches = 0; paulo@0: paulo@0: if (!url_file) paulo@0: { paulo@0: GT->DBGFN (GT, "empty url file from %s", http_req->host); paulo@0: return; paulo@0: } paulo@0: paulo@0: GT->DBGFN (GT, "urlfile from server = %s", url_file); paulo@0: paulo@0: while (url_file && *url_file) paulo@0: { paulo@0: char *url; paulo@0: char *host_name; paulo@0: char *remote_path; paulo@0: paulo@0: url = string_sep_set (&url_file, "\r\n"); paulo@0: paulo@0: /* skip past http:// */ paulo@0: string_sep (&url, "http://"); paulo@0: paulo@0: host_name = string_sep (&url, "/"); paulo@0: remote_path = url; paulo@0: paulo@0: /* NOTE: remote_path is possibly empty */ paulo@0: if (!host_name) paulo@0: continue; paulo@0: paulo@0: url = stringf ("http://%s/%s", host_name, STRING_NOTNULL(remote_path)); paulo@0: paulo@0: /* if the webcache is already in our db, skip it */ paulo@0: if (file_cache_lookup (web_caches, url)) paulo@0: continue; paulo@0: paulo@0: /* paulo@0: * Only allow caches to register two more caches: this paulo@0: * small number helps to avoid our list of caches getting paulo@0: * polluted. paulo@0: */ paulo@0: if (++caches > 2) paulo@0: break; paulo@0: paulo@0: /* format is: */ paulo@0: file_cache_insert (web_caches, url, "0"); paulo@0: } paulo@0: paulo@0: /* sync the pending web caches to disk */ paulo@0: file_cache_sync (web_caches); paulo@0: } paulo@0: paulo@0: static void end_request (HttpRequest *req, char *data) paulo@0: { paulo@0: char *str = req->request; paulo@0: paulo@0: if (str && !strncmp (str, "hostfile", strlen ("hostfile"))) paulo@0: parse_hostfile_response (req, data); paulo@0: else if (str && !strncmp (str, "urlfile", strlen ("urlfile"))) paulo@0: parse_urlfile_response (req, data); paulo@0: else paulo@0: abort (); paulo@0: } paulo@0: paulo@0: /*****************************************************************************/ paulo@0: paulo@0: /* paulo@0: * Return TRUE if newname is in the same domain as oldname. For example, paulo@0: * "new.gwc.example.com", "example.com", and "cache.example.com" are all paulo@0: * considered in the same domain as "www.example.com". paulo@0: * paulo@0: * This is called on redirects, to make sure the cache can't redirect to an paulo@0: * innocent site as part of a DDoS attack. paulo@0: */ paulo@0: static BOOL in_same_domain (const char *oldname, const char *newname) paulo@0: { paulo@0: return FALSE; paulo@0: #if 0 paulo@0: const char *p; paulo@0: const char *largest = NULL; paulo@0: int periods = 0; paulo@0: paulo@0: p = newname; paulo@0: paulo@0: /* get the largest common substring */ paulo@0: while (p != NULL) paulo@0: { paulo@0: if ((largest = strstr (oldname, p))) paulo@0: break; paulo@0: paulo@0: /* advance to next domain part */ paulo@0: p = strchr (p + 1, '.'); paulo@0: } paulo@0: paulo@0: if (!largest) paulo@0: return FALSE; paulo@0: paulo@0: /* paulo@0: * Make sure the substring matches completely to the end. This will paulo@0: * actually fail when it shouldn't if one name includes the '.' toplevel paulo@0: * domain and one doesn't. Oh well. paulo@0: */ paulo@0: if (strcmp (largest, p) != 0) paulo@0: return FALSE; paulo@0: paulo@0: /* paulo@0: * Count the number of periods to find the number of subdomains in the paulo@0: * largest common substring. paulo@0: */ paulo@0: for (p = largest; *p != 0; p++) paulo@0: { paulo@0: if (*p == '.') paulo@0: periods++; paulo@0: } paulo@0: paulo@0: /* paulo@0: * If the last character is the root '.', subtract one, since we are paulo@0: * looking for the number of common subdomains, and the root is shared by paulo@0: * all names. paulo@0: */ paulo@0: if (largest[strlen (largest) - 1] == '.') paulo@0: periods--; paulo@0: paulo@0: /* paulo@0: * If there are two periods, at least two toplevel domains match. paulo@0: */ paulo@0: if (periods >= 2) paulo@0: return TRUE; paulo@0: paulo@0: /* paulo@0: * If there is only one period shared, the names MAY be in the same paulo@0: * domain: one of the names has to be completely contained within the paulo@0: * other, such as the case of "foo.example.com" and "example.com". paulo@0: */ paulo@0: if (periods == 1 && paulo@0: (strcmp (largest, oldname) == 0 || strcmp (largest, newname) == 0)) paulo@0: { paulo@0: return TRUE; paulo@0: } paulo@0: paulo@0: /* not in same domain */ paulo@0: return FALSE; paulo@0: #endif paulo@0: } paulo@0: paulo@0: /* paulo@0: * Called to when the webcache sends a 300-level response with a provided paulo@0: * Location: header. Have to make sure the domain the cache directs us paulo@0: * to is the same. paulo@0: */ paulo@0: static BOOL handle_redirect (HttpRequest *req, const char *new_host, paulo@0: const char *new_path) paulo@0: { paulo@0: assert (new_host != NULL); paulo@0: paulo@0: if (in_same_domain (req->host, new_host) == FALSE) paulo@0: return FALSE; paulo@0: paulo@0: /* might want to do something else if the ban list later becomes per host paulo@0: * rather than per URL */ paulo@0: ban_webcache (req, "Redirected"); paulo@0: paulo@0: GT->DBGFN (GT, "Redirecting to new webcache %s/%s", new_host, new_path); paulo@0: paulo@0: insert_webcache (new_host, new_path, time (NULL)); paulo@0: file_cache_sync (web_caches); paulo@0: paulo@0: return TRUE; paulo@0: } paulo@0: paulo@0: /*****************************************************************************/ paulo@0: paulo@0: static BOOL handle_recv (HttpRequest *req, char *data, size_t len) paulo@0: { paulo@0: String *s; paulo@0: paulo@0: /* EOF */ paulo@0: if (!data) paulo@0: { paulo@0: char *str = NULL; paulo@0: paulo@0: if ((s = req->data)) paulo@0: str = s->str; paulo@0: paulo@0: GT->DBGFN (GT, "read %s from server %s", str, req->host); paulo@0: end_request (req, str); paulo@0: paulo@0: /* clear data link */ paulo@0: req->data = NULL; paulo@0: paulo@0: return TRUE; paulo@0: } paulo@0: paulo@0: if (!len) paulo@0: return TRUE; paulo@0: paulo@0: GT->DBGFN (GT, "server sent us: %s", data); paulo@0: paulo@0: if (!(s = req->data) && !(s = req->data = string_new (NULL, 0, 0, TRUE))) paulo@0: return FALSE; paulo@0: paulo@0: if (string_append (s, data) != len) paulo@0: { paulo@0: GT->DBGFN (GT, "string append failed"); paulo@0: return FALSE; paulo@0: } paulo@0: paulo@0: return TRUE; paulo@0: } paulo@0: paulo@0: /*****************************************************************************/ paulo@0: paulo@0: static BOOL handle_add_headers (HttpRequest *req, Dataset **headers) paulo@0: { paulo@0: /* don't let intermediaries cache our request, I think */ paulo@0: dataset_insertstr (headers, "Cache-Control", "no-cache"); paulo@0: paulo@0: return TRUE; paulo@0: } paulo@0: paulo@0: /*****************************************************************************/ paulo@0: paulo@0: static BOOL parse_host_and_port (char **r_host, in_port_t *r_port) paulo@0: { paulo@0: char *str; paulo@0: char *host; paulo@0: long port; paulo@0: paulo@0: str = *r_host; paulo@0: paulo@0: if (r_port) paulo@0: *r_port = 80; paulo@0: paulo@0: /* skip leading 'http://' if found */ paulo@0: if (strstr (str, "http://")) paulo@0: str += strlen ("http://"); paulo@0: paulo@0: host = string_sep (&str, ":"); paulo@0: paulo@0: if (!host) paulo@0: return FALSE; paulo@0: paulo@0: *r_host = host; paulo@0: paulo@0: if (str && !string_isempty (str)) paulo@0: { paulo@0: port = gift_strtol (str); paulo@0: paulo@0: /* make sure port is valid */ paulo@0: if (port <= 0 || port >= 65536) paulo@0: return FALSE; paulo@0: paulo@0: *r_port = port; paulo@0: } paulo@0: paulo@0: return TRUE; paulo@0: } paulo@0: paulo@0: static TCPC *open_http_connection (HttpRequest *req, const char *http_name) paulo@0: { paulo@0: in_addr_t ip; paulo@0: in_port_t port; paulo@0: char *str; paulo@0: char *name; paulo@0: TCPC *c; paulo@0: struct hostent *host; paulo@0: paulo@0: if (!http_name) paulo@0: return NULL; paulo@0: paulo@0: if (!(str = STRDUP (http_name))) paulo@0: return NULL; paulo@0: paulo@0: name = str; paulo@0: paulo@0: if (!parse_host_and_port (&name, &port)) paulo@0: { paulo@0: GT->DBGFN (GT, "error parsing hostname \"%s\"", str); paulo@0: free (str); paulo@0: return NULL; paulo@0: } paulo@0: paulo@0: if (!(host = gt_dns_lookup (name))) paulo@0: { paulo@0: free (str); paulo@0: return NULL; paulo@0: } paulo@0: paulo@0: /* ip is in network-order already */ paulo@0: memcpy (&ip, host->h_addr, MIN (host->h_length, sizeof (ip))); paulo@0: paulo@0: if (net_match_host (ip, "LOCAL")) paulo@0: { paulo@0: free (str); paulo@0: ban_webcache (req, "Resolved to local IP"); paulo@0: return NULL; paulo@0: } paulo@0: paulo@0: c = tcp_open (ip, port, FALSE); paulo@0: if (!c) paulo@0: { paulo@0: GT->DBGFN (GT, "couldn't open connection to %s [%s]: %s", paulo@0: http_name, net_ip_str (ip), GIFT_NETERROR()); paulo@0: } paulo@0: paulo@0: free (str); paulo@0: return c; paulo@0: } paulo@0: paulo@0: /* return the name we have to lookup */ paulo@0: static char *get_http_name (char *name) paulo@0: { paulo@0: char *proxy; paulo@0: char *host; paulo@0: paulo@0: host = name; paulo@0: proxy = HTTP_PROXY; paulo@0: paulo@0: string_trim (proxy); paulo@0: paulo@0: if (proxy && !string_isempty (proxy)) paulo@0: { paulo@0: /* connect to the proxy instead */ paulo@0: if (STRCMP (proxy, gt_proxy_server) != 0) paulo@0: { paulo@0: GT->DBGFN (GT, "using proxy server %s", proxy); paulo@0: free (gt_proxy_server); paulo@0: gt_proxy_server = STRDUP (proxy); paulo@0: } paulo@0: paulo@0: host = proxy; paulo@0: } paulo@0: paulo@0: return host; paulo@0: } paulo@0: paulo@0: static void check_dns_error (const char *name, HttpRequest *req) paulo@0: { paulo@0: int error; paulo@0: paulo@0: error = gt_dns_get_errno (); paulo@0: paulo@0: if (!error) paulo@0: return; paulo@0: paulo@0: GT->DBGFN (GT, "lookup failed on \"%s\": %s", name, gt_dns_strerror(error)); paulo@0: paulo@0: /* ban the host, but only if not using a proxy server */ paulo@0: if (error == HOST_NOT_FOUND && gt_proxy_server == NULL) paulo@0: { paulo@0: GT->DBGFN (GT, "webcache \"%s\" not in DNS. banning", name); paulo@0: ban_webcache (req, "Host not found in DNS"); paulo@0: return; paulo@0: } paulo@0: } paulo@0: paulo@0: static BOOL make_request (char *host_name, char *remote_path, char *request) paulo@0: { paulo@0: HttpRequest *req; paulo@0: TCPC *c; paulo@0: char *resolve_name; paulo@0: char *url; paulo@0: paulo@0: url = stringf_dup ("http://%s/%s", host_name, STRING_NOTNULL(remote_path)); paulo@0: paulo@0: if (!(req = gt_http_request_new (url, request))) paulo@0: { paulo@0: free (url); paulo@0: return FALSE; paulo@0: } paulo@0: paulo@0: free (url); paulo@0: paulo@0: resolve_name = get_http_name (host_name); paulo@0: paulo@0: gt_dns_set_errno (0); paulo@0: paulo@0: if (!(c = open_http_connection (req, resolve_name))) paulo@0: { paulo@0: check_dns_error (resolve_name, req); paulo@0: gt_http_request_close (req, -1); paulo@0: return FALSE; paulo@0: } paulo@0: paulo@0: GT->DBGFN (GT, "opening connection to %s [%s]", paulo@0: resolve_name, net_ip_str (c->host)); paulo@0: paulo@0: req->recv_func = handle_recv; paulo@0: req->add_header_func = handle_add_headers; paulo@0: req->close_req_func = handle_close_request; paulo@0: req->redirect_func = handle_redirect; paulo@0: paulo@0: gt_http_request_set_conn (req, c); /* setup references */ paulo@0: gt_http_request_set_proxy (req, gt_proxy_server); /* maybe use proxy */ paulo@0: gt_http_request_set_timeout (req, 2 * MINUTES); /* don't wait forever */ paulo@0: gt_http_request_set_max_len (req, 65536); /* don't read forever */ paulo@0: paulo@0: input_add (c->fd, c, INPUT_WRITE, paulo@0: (InputCallback)gt_http_request_handle, TIMEOUT_DEF); paulo@0: paulo@0: return TRUE; paulo@0: } paulo@0: paulo@0: /*****************************************************************************/ paulo@0: paulo@0: struct find_rand_args paulo@0: { paulo@0: int n; paulo@0: time_t now; paulo@0: char *url; paulo@0: char *field; paulo@0: }; paulo@0: paulo@0: /* get a random cache from the webcaches dataset */ paulo@0: static void foreach_rand_cache (ds_data_t *key, ds_data_t *value, paulo@0: struct find_rand_args *args) paulo@0: { paulo@0: time_t atime; paulo@0: float range = args->n; paulo@0: char *str; paulo@0: char *url = key->data; paulo@0: char *hostname, *path; paulo@0: int ret; paulo@0: paulo@0: if (!parse_web_cache_value (value->data, &atime)) paulo@0: return; paulo@0: paulo@0: /* skip the cache entirely if we've retried too soon */ paulo@0: if (args->now - atime < CACHE_RETRY_TIME) paulo@0: return; paulo@0: paulo@0: /* paulo@0: * Make sure the cache has a parseable url paulo@0: * paulo@0: * TODO: This is ugly, it really should be parsed into a paulo@0: * a data structure once instead. paulo@0: */ paulo@0: str = STRDUP (url); paulo@0: ret = gt_http_url_parse (str, &hostname, &path); paulo@0: free (str); paulo@0: paulo@0: if (!ret) paulo@0: { paulo@0: GT->warn (GT, "bad webcache url \"%s\" from %s/gwebcaches", paulo@0: key->data, gift_conf_path ("Gnutella")); paulo@0: return; paulo@0: } paulo@0: paulo@0: /* decrease probability of selecting the next web cache */ paulo@0: args->n++; paulo@0: paulo@0: /* paulo@0: * Select this webcache with probability 1/n. paulo@0: * paulo@0: * Also select this cache if we haven't chosen one yet, which may be the paulo@0: * case on if the index of the cache is > 0 when there are banned caches. paulo@0: */ paulo@0: if (args->url == NULL || paulo@0: range * rand() / (RAND_MAX + 1.0) < 1.0) paulo@0: { paulo@0: char *keystr = key->data; paulo@0: char *valuestr = value->data; paulo@0: paulo@0: /* check if this is a bad gwebcache */ paulo@0: if (file_cache_lookup (bad_caches, url)) paulo@0: { paulo@0: #if 1 paulo@0: GT->warn (GT, "skipping webcache %s, in bad gwebcaches", url); paulo@0: #endif paulo@0: /* pretend we didn't select this to ensure equal distribution */ paulo@0: args->n--; paulo@0: paulo@0: return; paulo@0: } paulo@0: paulo@0: /* free the old values */ paulo@0: free (args->url); paulo@0: free (args->field); paulo@0: paulo@0: args->url = STRDUP (keystr); paulo@0: args->field = STRDUP (valuestr); paulo@0: } paulo@0: } paulo@0: paulo@0: static BOOL get_random_cache (time_t now, char **r_host_name, paulo@0: char **r_remote_path) paulo@0: { paulo@0: int ret; paulo@0: struct find_rand_args args; paulo@0: paulo@0: args.n = 1; /* initial probability */ paulo@0: args.now = now; /* current time */ paulo@0: args.url = NULL; paulo@0: args.field = NULL; paulo@0: paulo@0: dataset_foreach (web_caches->d, DS_FOREACH(foreach_rand_cache), &args); paulo@0: paulo@0: if (!args.url) paulo@0: { paulo@0: GT->DBGFN (GT, "couldn't find random cache"); paulo@0: return FALSE; paulo@0: } paulo@0: paulo@0: ret = gt_http_url_parse (args.url, r_host_name, r_remote_path); paulo@0: paulo@0: if (!*r_host_name || !*r_remote_path) paulo@0: { paulo@0: free (args.url); paulo@0: free (args.field); paulo@0: return FALSE; paulo@0: } paulo@0: paulo@0: *r_host_name = STRDUP (*r_host_name); paulo@0: *r_remote_path = STRDUP (*r_remote_path); paulo@0: paulo@0: /* free the original buffer */ paulo@0: free (args.url); paulo@0: free (args.field); paulo@0: paulo@0: return ret; paulo@0: } paulo@0: paulo@0: static void access_gwebcaches (void) paulo@0: { paulo@0: int len; paulo@0: char *host_name; paulo@0: char *remote_path; paulo@0: time_t now; paulo@0: int host_requests = 0; paulo@0: #if 0 paulo@0: int url_requests = 0; paulo@0: #endif paulo@0: int max_requests = 1; paulo@0: BOOL ret; paulo@0: BOOL need_sync; paulo@0: paulo@0: /* paulo@0: * We may get called while a check of the gwebcaches is already paulo@0: * in progress. paulo@0: */ paulo@0: if (checking_caches) paulo@0: { paulo@0: GT->DBGFN (GT, "Access already in progress"); paulo@0: return; paulo@0: } paulo@0: paulo@0: now = time (NULL); paulo@0: paulo@0: len = dataset_length (web_caches->d); paulo@0: paulo@0: if (max_requests > len) paulo@0: max_requests = len; paulo@0: paulo@0: need_sync = FALSE; paulo@0: paulo@0: while (host_requests < max_requests) paulo@0: { paulo@0: if (!get_random_cache (now, &host_name, &remote_path)) paulo@0: { paulo@0: GT->DBGFN (GT, "error looking up cache"); paulo@0: break; paulo@0: } paulo@0: paulo@0: #if 0 paulo@0: /* make a url request sometimes to keep the cache file up to date, but paulo@0: * mostly ask for hosts */ paulo@0: if (10.0 * rand() / (RAND_MAX + 1.0) < 1.0) paulo@0: { paulo@0: ret = make_request (host_name, remote_path, paulo@0: "urlfile=1&client=GIFT&version=" GT_VERSION); paulo@0: url_requests++; paulo@0: } paulo@0: else paulo@0: #endif paulo@0: { paulo@0: ret = make_request (host_name, remote_path, paulo@0: "hostfile=1&client=GIFT&version=" GT_VERSION); paulo@0: paulo@0: if (ret) paulo@0: checking_caches = TRUE; paulo@0: paulo@0: host_requests++; paulo@0: } paulo@0: paulo@0: if (ret) paulo@0: { paulo@0: GT->DBGFN (GT, "hitting web cache [total cache hits %u] " paulo@0: "(cache: http://%s/%s)", cache_hits, paulo@0: host_name, STRING_NOTNULL(remote_path)); paulo@0: paulo@0: cache_hits++; paulo@0: need_sync = TRUE; paulo@0: paulo@0: /* reset the atime for the cache */ paulo@0: insert_webcache (host_name, remote_path, now); paulo@0: } paulo@0: paulo@0: free (host_name); paulo@0: free (remote_path); paulo@0: } paulo@0: paulo@0: /* only sync when we successfully accessed a cache */ paulo@0: if (need_sync) paulo@0: file_cache_sync (web_caches); paulo@0: } paulo@0: paulo@0: static BOOL webcache_update (void *udata) paulo@0: { paulo@0: char *webcache_file; paulo@0: int web_exists; paulo@0: time_t now; paulo@0: size_t nodes_len; paulo@0: struct stat st; paulo@0: paulo@0: if (GNUTELLA_LOCAL_MODE) paulo@0: return TRUE; paulo@0: paulo@0: now = time (NULL); paulo@0: nodes_len = gt_conn_length (GT_NODE_NONE, GT_NODE_ANY); paulo@0: paulo@0: /* paulo@0: * If we've already accessed the caches successfully, we won't paulo@0: * allow another access to go through, _unless_ the node list paulo@0: * is small enough, in which case it could be we really do need paulo@0: * to access the caches. paulo@0: */ paulo@0: if (now < next_atime && nodes_len >= 20) paulo@0: return FALSE; paulo@0: paulo@0: webcache_file = STRDUP (gift_conf_path ("Gnutella/gwebcaches")); paulo@0: web_exists = file_stat (webcache_file, &st); paulo@0: paulo@0: if (!web_exists) paulo@0: { paulo@0: GIFT_ERROR (("gwebcaches file doesn't exist")); paulo@0: return FALSE; paulo@0: } paulo@0: paulo@0: /* paulo@0: * next_atime, the absolute next time we allow ourselves to contact the paulo@0: * caches, gets set when we sucessfully access the caches, and if we paulo@0: * manage to get some hosts from a cache we access in an exponentially paulo@0: * decreasing interval. paulo@0: */ paulo@0: access_gwebcaches (); paulo@0: paulo@0: free (webcache_file); paulo@0: return TRUE; paulo@0: } paulo@0: paulo@0: /*****************************************************************************/ paulo@0: paulo@0: void gt_web_cache_update (void) paulo@0: { paulo@0: webcache_update (NULL); paulo@0: } paulo@0: paulo@0: BOOL gt_web_cache_init (void) paulo@0: { paulo@0: /* paulo@0: * Copy the gwebcaches file to from the data dir to paulo@0: * ~/.giFT/Gnutella if it is newer or if ~/.giFT/Gnutella/gwebcaches paulo@0: * doesn't exist. paulo@0: */ paulo@0: gt_config_load_file ("Gnutella/gwebcaches", TRUE, FALSE); paulo@0: paulo@0: web_caches = file_cache_new (gift_conf_path ("Gnutella/gwebcaches")); paulo@0: bad_caches = file_cache_new (gift_conf_path ("Gnutella/bad_gwebcaches")); paulo@0: paulo@0: if (!web_caches) paulo@0: return FALSE; paulo@0: paulo@0: return TRUE; paulo@0: } paulo@0: paulo@0: void gt_web_cache_cleanup (void) paulo@0: { paulo@0: file_cache_free (web_caches); paulo@0: web_caches = NULL; paulo@0: paulo@0: file_cache_free (bad_caches); paulo@0: bad_caches = NULL; paulo@0: paulo@0: cache_hits = 0; paulo@0: next_atime = 0; paulo@0: paulo@0: checking_caches = FALSE; paulo@0: }