annotate src/gt_web_cache.c @ 0:d39e1d0d75b6

initial add
author paulo@hit-nxdomain.opendns.com
date Sat, 20 Feb 2010 21:18:28 -0800
parents
children
rev   line source
paulo@0 1 /*
paulo@0 2 * $Id: gt_web_cache.c,v 1.65 2006/08/06 16:53:36 hexwab Exp $
paulo@0 3 *
paulo@0 4 * Copyright (C) 2001-2003 giFT project (gift.sourceforge.net)
paulo@0 5 *
paulo@0 6 * This program is free software; you can redistribute it and/or modify it
paulo@0 7 * under the terms of the GNU General Public License as published by the
paulo@0 8 * Free Software Foundation; either version 2, or (at your option) any
paulo@0 9 * later version.
paulo@0 10 *
paulo@0 11 * This program is distributed in the hope that it will be useful, but
paulo@0 12 * WITHOUT ANY WARRANTY; without even the implied warranty of
paulo@0 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
paulo@0 14 * General Public License for more details.
paulo@0 15 */
paulo@0 16
paulo@0 17 #include "gt_gnutella.h"
paulo@0 18
paulo@0 19 #include "file_cache.h"
paulo@0 20 #include "http_request.h"
paulo@0 21
paulo@0 22 #include "gt_connect.h"
paulo@0 23 #include "gt_node.h"
paulo@0 24 #include "gt_node_list.h"
paulo@0 25 #include "gt_netorg.h"
paulo@0 26
paulo@0 27 #include "gt_web_cache.h"
paulo@0 28 #include "gt_conf.h"
paulo@0 29
paulo@0 30 #include "dns.h"
paulo@0 31
paulo@0 32 /*****************************************************************************/
paulo@0 33
/* minimum time to wait before reconnecting to a webcache */
#define CACHE_RETRY_TIME (8 * EHOURS)

/*****************************************************************************/

/* number of times we have hit some gwebcaches */
static int cache_hits;

/* the absolute next time we will allow ourselves to access a cache */
static time_t next_atime;

/* amount of time to layoff the caches once we've received some data;
 * doubled each time a hostfile request yields enough hosts (exponential
 * backoff, see parse_hostfile_response) */
static time_t backoff_time = 1 * EHOURS;

/* holds all the caches (FileCache maps url -> last-access-time field) */
static FileCache *web_caches;

/* proxy server to contact, cached copy of the HTTP_PROXY config value */
static char *gt_proxy_server;

/* webcaches that succeeded connecting, but returned errors or an
 * unparseable response */
static FileCache *bad_caches;

/* whether we are in the process of checking the caches */
static BOOL checking_caches;
paulo@0 60
paulo@0 61 /*****************************************************************************/
paulo@0 62
paulo@0 63 static void parse_hostfile_response (HttpRequest *http_req, char *hosts_file);
paulo@0 64 static void parse_urlfile_response (HttpRequest *http_req, char *url_file);
paulo@0 65
paulo@0 66 /*****************************************************************************/
paulo@0 67
paulo@0 68 /* parse the extended data in the webcaches file, now its just mtime */
paulo@0 69 static BOOL parse_web_cache_value (char *value, time_t *r_atime)
paulo@0 70 {
paulo@0 71 time_t atime;
paulo@0 72
paulo@0 73 if ((atime = ATOUL (value)) == (unsigned long) -1)
paulo@0 74 atime = 0;
paulo@0 75
paulo@0 76 if (r_atime)
paulo@0 77 *r_atime = atime;
paulo@0 78
paulo@0 79 return TRUE;
paulo@0 80 }
paulo@0 81
paulo@0 82 /*****************************************************************************/
paulo@0 83
/* build a malloc'd "http://host/path" url for a webcache entry;
 * a NULL path is treated as the empty string */
static char *new_webcache_url (const char *host, const char *path)
{
	const char *safe_path = STRING_NOTNULL (path);

	return stringf_dup ("http://%s/%s", host, safe_path);
}
paulo@0 88
paulo@0 89 static void ban_webcache (HttpRequest *req, const char *why)
paulo@0 90 {
paulo@0 91 char *url;
paulo@0 92
paulo@0 93 url = new_webcache_url (req->host, req->path);
paulo@0 94 GT->dbg (GT, "banning webcache %s", url);
paulo@0 95
paulo@0 96 file_cache_insert (bad_caches, url, why);
paulo@0 97 file_cache_sync (bad_caches);
paulo@0 98
paulo@0 99 free (url);
paulo@0 100 }
paulo@0 101
paulo@0 102 static void insert_webcache (const char *host_name, const char *remote_path,
paulo@0 103 time_t atime)
paulo@0 104 {
paulo@0 105 char *url;
paulo@0 106 char *field;
paulo@0 107
paulo@0 108 url = new_webcache_url (host_name, remote_path);
paulo@0 109 field = stringf_dup ("%lu", atime);
paulo@0 110
paulo@0 111 file_cache_insert (web_caches, url, field);
paulo@0 112
paulo@0 113 free (url);
paulo@0 114 free (field);
paulo@0 115 }
paulo@0 116
paulo@0 117 /*****************************************************************************/
paulo@0 118
paulo@0 119 static void handle_close_request (HttpRequest *req, int error_code)
paulo@0 120 {
paulo@0 121 String *s;
paulo@0 122
paulo@0 123 if (error_code < 0 || error_code < 200 || error_code >= 300)
paulo@0 124 {
paulo@0 125 if (error_code == -1)
paulo@0 126 {
paulo@0 127 /* the error was our fault, out of mem, etc. dont do anything */
paulo@0 128 GT->DBGFN (GT, "connect to server %s failed for some reason",
paulo@0 129 req->host);
paulo@0 130 }
paulo@0 131 else
paulo@0 132 {
paulo@0 133 char err[32];
paulo@0 134
paulo@0 135 snprintf (err, sizeof(err), "Received error %d", error_code);
paulo@0 136
paulo@0 137 /*
paulo@0 138 * Not found, internal server error, or too many redirects: ban
paulo@0 139 * the server's URL
paulo@0 140 */
paulo@0 141 GT->DBGFN (GT, "server %s returned error %i", req->host,
paulo@0 142 error_code);
paulo@0 143 ban_webcache (req, err);
paulo@0 144 }
paulo@0 145 }
paulo@0 146
paulo@0 147 /* TODO: this assumes this is the one hostfile request flying around,
paulo@0 148 * and not a urlfile request, which probably needs to be handled
paulo@0 149 * separately */
paulo@0 150 checking_caches = FALSE;
paulo@0 151
paulo@0 152 if ((s = req->data))
paulo@0 153 string_free (s);
paulo@0 154 }
paulo@0 155
/*
 * Parse a "hostfile=1" webcache response: one "ip:port" entry per line.
 * Registers up to 50 hosts as ultrapeers, attempts connections to the
 * first 5, persists the node list, and backs off from the caches
 * exponentially when the response was useful (>= 5 hosts).
 */
static void parse_hostfile_response (HttpRequest *http_req, char *host_file)
{
	int hosts = 0;
	GtNode *node;
	time_t now;

	if (!host_file)
	{
		GT->DBGFN (GT, "empty host file from %s", http_req->host);
		return;
	}

	GT->DBGFN (GT, "hostfile from server = %s", host_file);

	now = time (NULL);

	/*
	 * If the response start with "ERROR: " (or pseudo-html '<' char), ban the
	 * webcache.
	 */
	if (!strncasecmp (host_file, "ERROR", sizeof ("ERROR") - 1) ||
	    host_file[0] == '<')
	{
		ban_webcache (http_req, "Malformed response content");
		return;
	}

	/* consume the buffer line by line (string_sep_set advances host_file) */
	while (host_file && *host_file)
	{
		char *host;
		in_addr_t ip;
		in_port_t port;

		host = string_sep_set (&host_file, "\r\n");

		/* split "ip:port"; net_ip parses the dotted quad */
		ip = net_ip (string_sep (&host, ":"));
		port = ATOI (host);

		if (!port || !ip || ip == INADDR_NONE)
			continue;

		GT->DBGFN (GT, "registering %s:%hu (from cache %s)", net_ip_str (ip),
		           port, http_req->host);

		/* register the hosts as ultrapeers */
		node = gt_node_register (ip, port, GT_NODE_ULTRA);
		hosts++;

		if (!node)
			continue;

		/* set the vitality on this node to preserve it across restarts */
		node->vitality = now;

		/* might be connected already */
		if (node->state != GT_NODE_DISCONNECTED)
			continue;

		/* try to connect to the first 5 */
		if (hosts <= 5 && gt_conn_need_connections (GT_NODE_ULTRA))
			gt_connect (node);

		/* don't allow the cache to register an infinite number of hosts */
		if (hosts >= 50)
			break;
	}

	/* save the nodes we added to disk so we dont hit the caches again */
	gt_node_list_save ();

	/*
	 * Do an exponential backoff from the caches. If we were online and
	 * able to receive data, we should be getting node information
	 * some other way now.
	 */
	if (hosts >= 5)
	{
		next_atime = now + backoff_time;
		backoff_time *= 2;
	}
}
paulo@0 237
/*
 * Parse a "urlfile=1" webcache response: one cache URL per line.
 * Normalizes each url, skips caches we already know, accepts at most
 * two new caches per response (pollution defense), and syncs to disk.
 */
static void parse_urlfile_response (HttpRequest *http_req, char *url_file)
{
	int caches = 0;

	if (!url_file)
	{
		GT->DBGFN (GT, "empty url file from %s", http_req->host);
		return;
	}

	GT->DBGFN (GT, "urlfile from server = %s", url_file);

	while (url_file && *url_file)
	{
		char *url;
		char *host_name;
		char *remote_path;

		url = string_sep_set (&url_file, "\r\n");

		/* skip past http:// */
		string_sep (&url, "http://");

		/* split "host/path"; remote_path keeps the remainder */
		host_name = string_sep (&url, "/");
		remote_path = url;

		/* NOTE: remote_path is possibly empty */
		if (!host_name)
			continue;

		/* rebuild a canonical url (stringf returns a static buffer) */
		url = stringf ("http://%s/%s", host_name, STRING_NOTNULL(remote_path));

		/* if the webcache is already in our db, skip it */
		if (file_cache_lookup (web_caches, url))
			continue;

		/*
		 * Only allow caches to register two more caches: this
		 * small number helps to avoid our list of caches getting
		 * polluted.
		 */
		if (++caches > 2)
			break;

		/* format is: <url> <last time visited> */
		file_cache_insert (web_caches, url, "0");
	}

	/* sync the pending web caches to disk */
	file_cache_sync (web_caches);
}
paulo@0 289
paulo@0 290 static void end_request (HttpRequest *req, char *data)
paulo@0 291 {
paulo@0 292 char *str = req->request;
paulo@0 293
paulo@0 294 if (str && !strncmp (str, "hostfile", strlen ("hostfile")))
paulo@0 295 parse_hostfile_response (req, data);
paulo@0 296 else if (str && !strncmp (str, "urlfile", strlen ("urlfile")))
paulo@0 297 parse_urlfile_response (req, data);
paulo@0 298 else
paulo@0 299 abort ();
paulo@0 300 }
paulo@0 301
paulo@0 302 /*****************************************************************************/
paulo@0 303
paulo@0 304 /*
paulo@0 305 * Return TRUE if newname is in the same domain as oldname. For example,
paulo@0 306 * "new.gwc.example.com", "example.com", and "cache.example.com" are all
paulo@0 307 * considered in the same domain as "www.example.com".
paulo@0 308 *
paulo@0 309 * This is called on redirects, to make sure the cache can't redirect to an
paulo@0 310 * innocent site as part of a DDoS attack.
paulo@0 311 */
static BOOL in_same_domain (const char *oldname, const char *newname)
{
	/* NOTE(review): the domain comparison is deliberately disabled -- every
	 * redirect is rejected (handle_redirect then returns FALSE).  The
	 * original implementation is preserved below under #if 0. */
	return FALSE;
#if 0
	const char *p;
	const char *largest = NULL;
	int periods = 0;

	p = newname;

	/* get the largest common substring */
	while (p != NULL)
	{
		if ((largest = strstr (oldname, p)))
			break;

		/* advance to next domain part */
		p = strchr (p + 1, '.');
	}

	if (!largest)
		return FALSE;

	/*
	 * Make sure the substring matches completely to the end. This will
	 * actually fail when it shouldn't if one name includes the '.' toplevel
	 * domain and one doesn't. Oh well.
	 */
	if (strcmp (largest, p) != 0)
		return FALSE;

	/*
	 * Count the number of periods to find the number of subdomains in the
	 * largest common substring.
	 */
	for (p = largest; *p != 0; p++)
	{
		if (*p == '.')
			periods++;
	}

	/*
	 * If the last character is the root '.', subtract one, since we are
	 * looking for the number of common subdomains, and the root is shared by
	 * all names.
	 */
	if (largest[strlen (largest) - 1] == '.')
		periods--;

	/*
	 * If there are two periods, at least two toplevel domains match.
	 */
	if (periods >= 2)
		return TRUE;

	/*
	 * If there is only one period shared, the names MAY be in the same
	 * domain: one of the names has to be completely contained within the
	 * other, such as the case of "foo.example.com" and "example.com".
	 */
	if (periods == 1 &&
	    (strcmp (largest, oldname) == 0 || strcmp (largest, newname) == 0))
	{
		return TRUE;
	}

	/* not in same domain */
	return FALSE;
#endif
}
paulo@0 382
paulo@0 383 /*
paulo@0 384 * Called to when the webcache sends a 300-level response with a provided
paulo@0 385 * Location: header. Have to make sure the domain the cache directs us
paulo@0 386 * to is the same.
paulo@0 387 */
paulo@0 388 static BOOL handle_redirect (HttpRequest *req, const char *new_host,
paulo@0 389 const char *new_path)
paulo@0 390 {
paulo@0 391 assert (new_host != NULL);
paulo@0 392
paulo@0 393 if (in_same_domain (req->host, new_host) == FALSE)
paulo@0 394 return FALSE;
paulo@0 395
paulo@0 396 /* might want to do something else if the ban list later becomes per host
paulo@0 397 * rather than per URL */
paulo@0 398 ban_webcache (req, "Redirected");
paulo@0 399
paulo@0 400 GT->DBGFN (GT, "Redirecting to new webcache %s/%s", new_host, new_path);
paulo@0 401
paulo@0 402 insert_webcache (new_host, new_path, time (NULL));
paulo@0 403 file_cache_sync (web_caches);
paulo@0 404
paulo@0 405 return TRUE;
paulo@0 406 }
paulo@0 407
paulo@0 408 /*****************************************************************************/
paulo@0 409
/*
 * Incremental receive callback.  Non-NULL data chunks are appended to a
 * String buffer hung off req->data; a NULL data pointer signals EOF, at
 * which point the accumulated text is handed to end_request().
 * Returns FALSE only on allocation/append failure.
 */
static BOOL handle_recv (HttpRequest *req, char *data, size_t len)
{
	String *s;

	/* EOF */
	if (!data)
	{
		char *str = NULL;

		if ((s = req->data))
			str = s->str;

		GT->DBGFN (GT, "read %s from server %s", str, req->host);
		end_request (req, str);

		/* clear data link
		 * NOTE(review): only the link is cleared here and end_request does
		 * not free it -- the String s looks leaked on this path; confirm
		 * ownership of req->data/s->str against http_request.c */
		req->data = NULL;

		return TRUE;
	}

	if (!len)
		return TRUE;

	GT->DBGFN (GT, "server sent us: %s", data);

	/* lazily create the accumulation buffer on the first chunk */
	if (!(s = req->data) && !(s = req->data = string_new (NULL, 0, 0, TRUE)))
		return FALSE;

	if (string_append (s, data) != len)
	{
		GT->DBGFN (GT, "string append failed");
		return FALSE;
	}

	return TRUE;
}
paulo@0 447
paulo@0 448 /*****************************************************************************/
paulo@0 449
paulo@0 450 static BOOL handle_add_headers (HttpRequest *req, Dataset **headers)
paulo@0 451 {
paulo@0 452 /* don't let intermediaries cache our request, I think */
paulo@0 453 dataset_insertstr (headers, "Cache-Control", "no-cache");
paulo@0 454
paulo@0 455 return TRUE;
paulo@0 456 }
paulo@0 457
paulo@0 458 /*****************************************************************************/
paulo@0 459
paulo@0 460 static BOOL parse_host_and_port (char **r_host, in_port_t *r_port)
paulo@0 461 {
paulo@0 462 char *str;
paulo@0 463 char *host;
paulo@0 464 long port;
paulo@0 465
paulo@0 466 str = *r_host;
paulo@0 467
paulo@0 468 if (r_port)
paulo@0 469 *r_port = 80;
paulo@0 470
paulo@0 471 /* skip leading 'http://' if found */
paulo@0 472 if (strstr (str, "http://"))
paulo@0 473 str += strlen ("http://");
paulo@0 474
paulo@0 475 host = string_sep (&str, ":");
paulo@0 476
paulo@0 477 if (!host)
paulo@0 478 return FALSE;
paulo@0 479
paulo@0 480 *r_host = host;
paulo@0 481
paulo@0 482 if (str && !string_isempty (str))
paulo@0 483 {
paulo@0 484 port = gift_strtol (str);
paulo@0 485
paulo@0 486 /* make sure port is valid */
paulo@0 487 if (port <= 0 || port >= 65536)
paulo@0 488 return FALSE;
paulo@0 489
paulo@0 490 *r_port = port;
paulo@0 491 }
paulo@0 492
paulo@0 493 return TRUE;
paulo@0 494 }
paulo@0 495
/*
 * Resolve http_name ("host[:port]", optionally with an http:// prefix)
 * and open a TCP connection to it.  Returns NULL on parse, lookup, or
 * connect failure.  Bans the webcache when its name resolves to a local
 * address, to stop caches pointing us at our own network.
 */
static TCPC *open_http_connection (HttpRequest *req, const char *http_name)
{
	in_addr_t ip;
	in_port_t port;
	char *str;
	char *name;
	TCPC *c;
	struct hostent *host;

	if (!http_name)
		return NULL;

	/* work on a private copy: parse_host_and_port modifies the string */
	if (!(str = STRDUP (http_name)))
		return NULL;

	name = str;

	if (!parse_host_and_port (&name, &port))
	{
		GT->DBGFN (GT, "error parsing hostname \"%s\"", str);
		free (str);
		return NULL;
	}

	if (!(host = gt_dns_lookup (name)))
	{
		free (str);
		return NULL;
	}

	/* ip is in network-order already */
	memcpy (&ip, host->h_addr, MIN (host->h_length, sizeof (ip)));

	/* refuse caches that resolve to a local/private address */
	if (net_match_host (ip, "LOCAL"))
	{
		free (str);
		ban_webcache (req, "Resolved to local IP");
		return NULL;
	}

	c = tcp_open (ip, port, FALSE);
	if (!c)
	{
		GT->DBGFN (GT, "couldn't open connection to %s [%s]: %s",
		           http_name, net_ip_str (ip), GIFT_NETERROR());
	}

	free (str);
	return c;
}
paulo@0 546
paulo@0 547 /* return the name we have to lookup */
paulo@0 548 static char *get_http_name (char *name)
paulo@0 549 {
paulo@0 550 char *proxy;
paulo@0 551 char *host;
paulo@0 552
paulo@0 553 host = name;
paulo@0 554 proxy = HTTP_PROXY;
paulo@0 555
paulo@0 556 string_trim (proxy);
paulo@0 557
paulo@0 558 if (proxy && !string_isempty (proxy))
paulo@0 559 {
paulo@0 560 /* connect to the proxy instead */
paulo@0 561 if (STRCMP (proxy, gt_proxy_server) != 0)
paulo@0 562 {
paulo@0 563 GT->DBGFN (GT, "using proxy server %s", proxy);
paulo@0 564 free (gt_proxy_server);
paulo@0 565 gt_proxy_server = STRDUP (proxy);
paulo@0 566 }
paulo@0 567
paulo@0 568 host = proxy;
paulo@0 569 }
paulo@0 570
paulo@0 571 return host;
paulo@0 572 }
paulo@0 573
paulo@0 574 static void check_dns_error (const char *name, HttpRequest *req)
paulo@0 575 {
paulo@0 576 int error;
paulo@0 577
paulo@0 578 error = gt_dns_get_errno ();
paulo@0 579
paulo@0 580 if (!error)
paulo@0 581 return;
paulo@0 582
paulo@0 583 GT->DBGFN (GT, "lookup failed on \"%s\": %s", name, gt_dns_strerror(error));
paulo@0 584
paulo@0 585 /* ban the host, but only if not using a proxy server */
paulo@0 586 if (error == HOST_NOT_FOUND && gt_proxy_server == NULL)
paulo@0 587 {
paulo@0 588 GT->DBGFN (GT, "webcache \"%s\" not in DNS. banning", name);
paulo@0 589 ban_webcache (req, "Host not found in DNS");
paulo@0 590 return;
paulo@0 591 }
paulo@0 592 }
paulo@0 593
/*
 * Build and launch an HTTP request against a webcache.  `request` is the
 * query string ("hostfile=1..." or "urlfile=1...").  Returns TRUE when
 * the request was started; on failure the request object is closed here.
 */
static BOOL make_request (char *host_name, char *remote_path, char *request)
{
	HttpRequest *req;
	TCPC *c;
	char *resolve_name;
	char *url;

	url = stringf_dup ("http://%s/%s", host_name, STRING_NOTNULL(remote_path));

	if (!(req = gt_http_request_new (url, request)))
	{
		free (url);
		return FALSE;
	}

	/* the request keeps its own copy; release ours */
	free (url);

	/* resolve the proxy instead of the cache when one is configured */
	resolve_name = get_http_name (host_name);

	/* clear the resolver error so check_dns_error sees fresh state */
	gt_dns_set_errno (0);

	if (!(c = open_http_connection (req, resolve_name)))
	{
		/* may ban the cache if its name failed to resolve */
		check_dns_error (resolve_name, req);
		gt_http_request_close (req, -1);
		return FALSE;
	}

	GT->DBGFN (GT, "opening connection to %s [%s]",
	           resolve_name, net_ip_str (c->host));

	/* wire up the callbacks that drive this request */
	req->recv_func = handle_recv;
	req->add_header_func = handle_add_headers;
	req->close_req_func = handle_close_request;
	req->redirect_func = handle_redirect;

	gt_http_request_set_conn (req, c);                /* setup references */
	gt_http_request_set_proxy (req, gt_proxy_server); /* maybe use proxy */
	gt_http_request_set_timeout (req, 2 * MINUTES);   /* don't wait forever */
	gt_http_request_set_max_len (req, 65536);         /* don't read forever */

	input_add (c->fd, c, INPUT_WRITE,
	           (InputCallback)gt_http_request_handle, TIMEOUT_DEF);

	return TRUE;
}
paulo@0 640
paulo@0 641 /*****************************************************************************/
paulo@0 642
/* accumulator for the weighted random selection in foreach_rand_cache() */
struct find_rand_args
{
	int n;          /* count of eligible caches seen so far (selection weight) */
	time_t now;     /* current time, for the retry-interval check */
	char *url;      /* currently selected cache url (malloc'd copy) */
	char *field;    /* its stored value field (malloc'd copy) */
};
paulo@0 650
paulo@0 651 /* get a random cache from the webcaches dataset */
/*
 * Dataset iterator: reservoir-style selection of one random cache.
 * The i-th eligible cache replaces the current pick with probability
 * 1/i, giving each eligible cache an equal chance overall.  Caches
 * retried too recently, with unparseable urls, or present in the
 * bad-caches list are skipped.
 */
static void foreach_rand_cache (ds_data_t *key, ds_data_t *value,
                                struct find_rand_args *args)
{
	time_t atime;
	float range = args->n;
	char *str;
	char *url = key->data;
	char *hostname, *path;
	int ret;

	if (!parse_web_cache_value (value->data, &atime))
		return;

	/* skip the cache entirely if we've retried too soon */
	if (args->now - atime < CACHE_RETRY_TIME)
		return;

	/*
	 * Make sure the cache has a parseable url
	 *
	 * TODO: This is ugly, it really should be parsed into a
	 * a data structure once instead.
	 */
	str = STRDUP (url);
	ret = gt_http_url_parse (str, &hostname, &path);
	free (str);

	if (!ret)
	{
		GT->warn (GT, "bad webcache url \"%s\" from %s/gwebcaches",
		          key->data, gift_conf_path ("Gnutella"));
		return;
	}

	/* decrease probability of selecting the next web cache */
	args->n++;

	/*
	 * Select this webcache with probability 1/n.
	 *
	 * Also select this cache if we haven't chosen one yet, which may be the
	 * case on if the index of the cache is > 0 when there are banned caches.
	 */
	if (args->url == NULL ||
	    range * rand() / (RAND_MAX + 1.0) < 1.0)
	{
		char *keystr = key->data;
		char *valuestr = value->data;

		/* check if this is a bad gwebcache */
		if (file_cache_lookup (bad_caches, url))
		{
#if 1
			GT->warn (GT, "skipping webcache %s, in bad gwebcaches", url);
#endif
			/* pretend we didn't select this to ensure equal distribution */
			args->n--;

			return;
		}

		/* free the old values */
		free (args->url);
		free (args->field);

		args->url = STRDUP (keystr);
		args->field = STRDUP (valuestr);
	}
}
paulo@0 721
paulo@0 722 static BOOL get_random_cache (time_t now, char **r_host_name,
paulo@0 723 char **r_remote_path)
paulo@0 724 {
paulo@0 725 int ret;
paulo@0 726 struct find_rand_args args;
paulo@0 727
paulo@0 728 args.n = 1; /* initial probability */
paulo@0 729 args.now = now; /* current time */
paulo@0 730 args.url = NULL;
paulo@0 731 args.field = NULL;
paulo@0 732
paulo@0 733 dataset_foreach (web_caches->d, DS_FOREACH(foreach_rand_cache), &args);
paulo@0 734
paulo@0 735 if (!args.url)
paulo@0 736 {
paulo@0 737 GT->DBGFN (GT, "couldn't find random cache");
paulo@0 738 return FALSE;
paulo@0 739 }
paulo@0 740
paulo@0 741 ret = gt_http_url_parse (args.url, r_host_name, r_remote_path);
paulo@0 742
paulo@0 743 if (!*r_host_name || !*r_remote_path)
paulo@0 744 {
paulo@0 745 free (args.url);
paulo@0 746 free (args.field);
paulo@0 747 return FALSE;
paulo@0 748 }
paulo@0 749
paulo@0 750 *r_host_name = STRDUP (*r_host_name);
paulo@0 751 *r_remote_path = STRDUP (*r_remote_path);
paulo@0 752
paulo@0 753 /* free the original buffer */
paulo@0 754 free (args.url);
paulo@0 755 free (args.field);
paulo@0 756
paulo@0 757 return ret;
paulo@0 758 }
paulo@0 759
/*
 * Pick random caches and issue hostfile requests to them (at most
 * max_requests per call; currently 1).  Successful requests bump the
 * cache's access time and hit counter; the cache file is synced only if
 * at least one request went out.  A no-op while a check is in flight.
 */
static void access_gwebcaches (void)
{
	int len;
	char *host_name;
	char *remote_path;
	time_t now;
	int host_requests = 0;
#if 0
	int url_requests = 0;
#endif
	int max_requests = 1;
	BOOL ret;
	BOOL need_sync;

	/*
	 * We may get called while a check of the gwebcaches is already
	 * in progress.
	 */
	if (checking_caches)
	{
		GT->DBGFN (GT, "Access already in progress");
		return;
	}

	now = time (NULL);

	len = dataset_length (web_caches->d);

	/* don't try more requests than there are caches */
	if (max_requests > len)
		max_requests = len;

	need_sync = FALSE;

	while (host_requests < max_requests)
	{
		if (!get_random_cache (now, &host_name, &remote_path))
		{
			GT->DBGFN (GT, "error looking up cache");
			break;
		}

#if 0
		/* make a url request sometimes to keep the cache file up to date, but
		 * mostly ask for hosts */
		if (10.0 * rand() / (RAND_MAX + 1.0) < 1.0)
		{
			ret = make_request (host_name, remote_path,
			                    "urlfile=1&client=GIFT&version=" GT_VERSION);
			url_requests++;
		}
		else
#endif
		{
			ret = make_request (host_name, remote_path,
			                    "hostfile=1&client=GIFT&version=" GT_VERSION);

			/* flag the in-flight check; cleared in handle_close_request */
			if (ret)
				checking_caches = TRUE;

			host_requests++;
		}

		if (ret)
		{
			GT->DBGFN (GT, "hitting web cache [total cache hits %u] "
			           "(cache: http://%s/%s)", cache_hits,
			           host_name, STRING_NOTNULL(remote_path));

			cache_hits++;
			need_sync = TRUE;

			/* reset the atime for the cache */
			insert_webcache (host_name, remote_path, now);
		}

		free (host_name);
		free (remote_path);
	}

	/* only sync when we successfully accessed a cache */
	if (need_sync)
		file_cache_sync (web_caches);
}
paulo@0 843
paulo@0 844 static BOOL webcache_update (void *udata)
paulo@0 845 {
paulo@0 846 char *webcache_file;
paulo@0 847 int web_exists;
paulo@0 848 time_t now;
paulo@0 849 size_t nodes_len;
paulo@0 850 struct stat st;
paulo@0 851
paulo@0 852 if (GNUTELLA_LOCAL_MODE)
paulo@0 853 return TRUE;
paulo@0 854
paulo@0 855 now = time (NULL);
paulo@0 856 nodes_len = gt_conn_length (GT_NODE_NONE, GT_NODE_ANY);
paulo@0 857
paulo@0 858 /*
paulo@0 859 * If we've already accessed the caches successfully, we won't
paulo@0 860 * allow another access to go through, _unless_ the node list
paulo@0 861 * is small enough, in which case it could be we really do need
paulo@0 862 * to access the caches.
paulo@0 863 */
paulo@0 864 if (now < next_atime && nodes_len >= 20)
paulo@0 865 return FALSE;
paulo@0 866
paulo@0 867 webcache_file = STRDUP (gift_conf_path ("Gnutella/gwebcaches"));
paulo@0 868 web_exists = file_stat (webcache_file, &st);
paulo@0 869
paulo@0 870 if (!web_exists)
paulo@0 871 {
paulo@0 872 GIFT_ERROR (("gwebcaches file doesn't exist"));
paulo@0 873 return FALSE;
paulo@0 874 }
paulo@0 875
paulo@0 876 /*
paulo@0 877 * next_atime, the absolute next time we allow ourselves to contact the
paulo@0 878 * caches, gets set when we sucessfully access the caches, and if we
paulo@0 879 * manage to get some hosts from a cache we access in an exponentially
paulo@0 880 * decreasing interval.
paulo@0 881 */
paulo@0 882 access_gwebcaches ();
paulo@0 883
paulo@0 884 free (webcache_file);
paulo@0 885 return TRUE;
paulo@0 886 }
paulo@0 887
paulo@0 888 /*****************************************************************************/
paulo@0 889
paulo@0 890 void gt_web_cache_update (void)
paulo@0 891 {
paulo@0 892 webcache_update (NULL);
paulo@0 893 }
paulo@0 894
paulo@0 895 BOOL gt_web_cache_init (void)
paulo@0 896 {
paulo@0 897 /*
paulo@0 898 * Copy the gwebcaches file to from the data dir to
paulo@0 899 * ~/.giFT/Gnutella if it is newer or if ~/.giFT/Gnutella/gwebcaches
paulo@0 900 * doesn't exist.
paulo@0 901 */
paulo@0 902 gt_config_load_file ("Gnutella/gwebcaches", TRUE, FALSE);
paulo@0 903
paulo@0 904 web_caches = file_cache_new (gift_conf_path ("Gnutella/gwebcaches"));
paulo@0 905 bad_caches = file_cache_new (gift_conf_path ("Gnutella/bad_gwebcaches"));
paulo@0 906
paulo@0 907 if (!web_caches)
paulo@0 908 return FALSE;
paulo@0 909
paulo@0 910 return TRUE;
paulo@0 911 }
paulo@0 912
paulo@0 913 void gt_web_cache_cleanup (void)
paulo@0 914 {
paulo@0 915 file_cache_free (web_caches);
paulo@0 916 web_caches = NULL;
paulo@0 917
paulo@0 918 file_cache_free (bad_caches);
paulo@0 919 bad_caches = NULL;
paulo@0 920
paulo@0 921 cache_hits = 0;
paulo@0 922 next_atime = 0;
paulo@0 923
paulo@0 924 checking_caches = FALSE;
paulo@0 925 }