rev |
line source |
paulo@0
|
1 /*
|
paulo@0
|
2 * $Id: gt_web_cache.c,v 1.65 2006/08/06 16:53:36 hexwab Exp $
|
paulo@0
|
3 *
|
paulo@0
|
4 * Copyright (C) 2001-2003 giFT project (gift.sourceforge.net)
|
paulo@0
|
5 *
|
paulo@0
|
6 * This program is free software; you can redistribute it and/or modify it
|
paulo@0
|
7 * under the terms of the GNU General Public License as published by the
|
paulo@0
|
8 * Free Software Foundation; either version 2, or (at your option) any
|
paulo@0
|
9 * later version.
|
paulo@0
|
10 *
|
paulo@0
|
11 * This program is distributed in the hope that it will be useful, but
|
paulo@0
|
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
|
paulo@0
|
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
paulo@0
|
14 * General Public License for more details.
|
paulo@0
|
15 */
|
paulo@0
|
16
|
paulo@0
|
17 #include "gt_gnutella.h"
|
paulo@0
|
18
|
paulo@0
|
19 #include "file_cache.h"
|
paulo@0
|
20 #include "http_request.h"
|
paulo@0
|
21
|
paulo@0
|
22 #include "gt_connect.h"
|
paulo@0
|
23 #include "gt_node.h"
|
paulo@0
|
24 #include "gt_node_list.h"
|
paulo@0
|
25 #include "gt_netorg.h"
|
paulo@0
|
26
|
paulo@0
|
27 #include "gt_web_cache.h"
|
paulo@0
|
28 #include "gt_conf.h"
|
paulo@0
|
29
|
paulo@0
|
30 #include "dns.h"
|
paulo@0
|
31
|
paulo@0
|
32 /*****************************************************************************/
|
paulo@0
|
33
|
paulo@0
|
/*
 * Minimum time to wait before reconnecting to a webcache.  Compared against
 * "now - last access time" when picking a cache, so a cache contacted more
 * recently than this is skipped.
 * NOTE(review): EHOURS is defined elsewhere; presumably one hour expressed in
 * seconds -- confirm.
 */
#define CACHE_RETRY_TIME (8 * EHOURS)

/*****************************************************************************/

/* number of times we have hit some gwebcaches (incremented once per request
 * that was successfully started) */
static int cache_hits;

/* the absolute next time we will allow ourselves to access a cache; set after
 * a hostfile response yielded enough hosts */
static time_t next_atime;

/* amount of time to lay off the caches once we've received some data; doubled
 * every time it is applied */
static time_t backoff_time = 1 * EHOURS;

/* holds all the caches, keyed by URL with the last access time as value */
static FileCache *web_caches;

/* proxy server to contact; NULL until a proxy setting has been seen */
static char *gt_proxy_server;

/* webcaches that succeeded connecting, but returned errors or an
 * unparseable response; keyed by URL with the ban reason as value */
static FileCache *bad_caches;

/* whether we are in the process of checking the caches; set when a hostfile
 * request is started and cleared when a request is closed */
static BOOL checking_caches;

/*****************************************************************************/

/* response parsers, dispatched on the request type once a response has been
 * read completely */
static void parse_hostfile_response (HttpRequest *http_req, char *hosts_file);
static void parse_urlfile_response (HttpRequest *http_req, char *url_file);
|
paulo@0
|
65
|
paulo@0
|
66 /*****************************************************************************/
|
paulo@0
|
67
|
paulo@0
|
68 /* parse the extended data in the webcaches file, now its just mtime */
|
paulo@0
|
69 static BOOL parse_web_cache_value (char *value, time_t *r_atime)
|
paulo@0
|
70 {
|
paulo@0
|
71 time_t atime;
|
paulo@0
|
72
|
paulo@0
|
73 if ((atime = ATOUL (value)) == (unsigned long) -1)
|
paulo@0
|
74 atime = 0;
|
paulo@0
|
75
|
paulo@0
|
76 if (r_atime)
|
paulo@0
|
77 *r_atime = atime;
|
paulo@0
|
78
|
paulo@0
|
79 return TRUE;
|
paulo@0
|
80 }
|
paulo@0
|
81
|
paulo@0
|
82 /*****************************************************************************/
|
paulo@0
|
83
|
paulo@0
|
/*
 * Build the "http://<host>/<path>" form used as key in the cache files.
 * A NULL path is passed through STRING_NOTNULL before formatting.  The result
 * comes from stringf_dup() and is released with free() by the callers in
 * this file.
 */
static char *new_webcache_url (const char *host, const char *path)
{
	const char *safe_path = STRING_NOTNULL(path);

	return stringf_dup ("http://%s/%s", host, safe_path);
}
|
paulo@0
|
88
|
paulo@0
|
89 static void ban_webcache (HttpRequest *req, const char *why)
|
paulo@0
|
90 {
|
paulo@0
|
91 char *url;
|
paulo@0
|
92
|
paulo@0
|
93 url = new_webcache_url (req->host, req->path);
|
paulo@0
|
94 GT->dbg (GT, "banning webcache %s", url);
|
paulo@0
|
95
|
paulo@0
|
96 file_cache_insert (bad_caches, url, why);
|
paulo@0
|
97 file_cache_sync (bad_caches);
|
paulo@0
|
98
|
paulo@0
|
99 free (url);
|
paulo@0
|
100 }
|
paulo@0
|
101
|
paulo@0
|
102 static void insert_webcache (const char *host_name, const char *remote_path,
|
paulo@0
|
103 time_t atime)
|
paulo@0
|
104 {
|
paulo@0
|
105 char *url;
|
paulo@0
|
106 char *field;
|
paulo@0
|
107
|
paulo@0
|
108 url = new_webcache_url (host_name, remote_path);
|
paulo@0
|
109 field = stringf_dup ("%lu", atime);
|
paulo@0
|
110
|
paulo@0
|
111 file_cache_insert (web_caches, url, field);
|
paulo@0
|
112
|
paulo@0
|
113 free (url);
|
paulo@0
|
114 free (field);
|
paulo@0
|
115 }
|
paulo@0
|
116
|
paulo@0
|
117 /*****************************************************************************/
|
paulo@0
|
118
|
paulo@0
|
119 static void handle_close_request (HttpRequest *req, int error_code)
|
paulo@0
|
120 {
|
paulo@0
|
121 String *s;
|
paulo@0
|
122
|
paulo@0
|
123 if (error_code < 0 || error_code < 200 || error_code >= 300)
|
paulo@0
|
124 {
|
paulo@0
|
125 if (error_code == -1)
|
paulo@0
|
126 {
|
paulo@0
|
127 /* the error was our fault, out of mem, etc. dont do anything */
|
paulo@0
|
128 GT->DBGFN (GT, "connect to server %s failed for some reason",
|
paulo@0
|
129 req->host);
|
paulo@0
|
130 }
|
paulo@0
|
131 else
|
paulo@0
|
132 {
|
paulo@0
|
133 char err[32];
|
paulo@0
|
134
|
paulo@0
|
135 snprintf (err, sizeof(err), "Received error %d", error_code);
|
paulo@0
|
136
|
paulo@0
|
137 /*
|
paulo@0
|
138 * Not found, internal server error, or too many redirects: ban
|
paulo@0
|
139 * the server's URL
|
paulo@0
|
140 */
|
paulo@0
|
141 GT->DBGFN (GT, "server %s returned error %i", req->host,
|
paulo@0
|
142 error_code);
|
paulo@0
|
143 ban_webcache (req, err);
|
paulo@0
|
144 }
|
paulo@0
|
145 }
|
paulo@0
|
146
|
paulo@0
|
147 /* TODO: this assumes this is the one hostfile request flying around,
|
paulo@0
|
148 * and not a urlfile request, which probably needs to be handled
|
paulo@0
|
149 * separately */
|
paulo@0
|
150 checking_caches = FALSE;
|
paulo@0
|
151
|
paulo@0
|
152 if ((s = req->data))
|
paulo@0
|
153 string_free (s);
|
paulo@0
|
154 }
|
paulo@0
|
155
|
paulo@0
|
156 static void parse_hostfile_response (HttpRequest *http_req, char *host_file)
|
paulo@0
|
157 {
|
paulo@0
|
158 int hosts = 0;
|
paulo@0
|
159 GtNode *node;
|
paulo@0
|
160 time_t now;
|
paulo@0
|
161
|
paulo@0
|
162 if (!host_file)
|
paulo@0
|
163 {
|
paulo@0
|
164 GT->DBGFN (GT, "empty host file from %s", http_req->host);
|
paulo@0
|
165 return;
|
paulo@0
|
166 }
|
paulo@0
|
167
|
paulo@0
|
168 GT->DBGFN (GT, "hostfile from server = %s", host_file);
|
paulo@0
|
169
|
paulo@0
|
170 now = time (NULL);
|
paulo@0
|
171
|
paulo@0
|
172 /*
|
paulo@0
|
173 * If the response start with "ERROR: " (or pseudo-html '<' char), ban the
|
paulo@0
|
174 * webcache.
|
paulo@0
|
175 */
|
paulo@0
|
176 if (!strncasecmp (host_file, "ERROR", sizeof ("ERROR") - 1) ||
|
paulo@0
|
177 host_file[0] == '<')
|
paulo@0
|
178 {
|
paulo@0
|
179 ban_webcache (http_req, "Malformed response content");
|
paulo@0
|
180 return;
|
paulo@0
|
181 }
|
paulo@0
|
182
|
paulo@0
|
183 while (host_file && *host_file)
|
paulo@0
|
184 {
|
paulo@0
|
185 char *host;
|
paulo@0
|
186 in_addr_t ip;
|
paulo@0
|
187 in_port_t port;
|
paulo@0
|
188
|
paulo@0
|
189 host = string_sep_set (&host_file, "\r\n");
|
paulo@0
|
190
|
paulo@0
|
191 ip = net_ip (string_sep (&host, ":"));
|
paulo@0
|
192 port = ATOI (host);
|
paulo@0
|
193
|
paulo@0
|
194 if (!port || !ip || ip == INADDR_NONE)
|
paulo@0
|
195 continue;
|
paulo@0
|
196
|
paulo@0
|
197 GT->DBGFN (GT, "registering %s:%hu (from cache %s)", net_ip_str (ip),
|
paulo@0
|
198 port, http_req->host);
|
paulo@0
|
199
|
paulo@0
|
200 /* register the hosts as ultrapeers */
|
paulo@0
|
201 node = gt_node_register (ip, port, GT_NODE_ULTRA);
|
paulo@0
|
202 hosts++;
|
paulo@0
|
203
|
paulo@0
|
204 if (!node)
|
paulo@0
|
205 continue;
|
paulo@0
|
206
|
paulo@0
|
207 /* set the vitality on this node to preserve it across restarts */
|
paulo@0
|
208 node->vitality = now;
|
paulo@0
|
209
|
paulo@0
|
210 /* might be connected already */
|
paulo@0
|
211 if (node->state != GT_NODE_DISCONNECTED)
|
paulo@0
|
212 continue;
|
paulo@0
|
213
|
paulo@0
|
214 /* try to connect to the first 5 */
|
paulo@0
|
215 if (hosts <= 5 && gt_conn_need_connections (GT_NODE_ULTRA))
|
paulo@0
|
216 gt_connect (node);
|
paulo@0
|
217
|
paulo@0
|
218 /* don't allow the cache to register an infinite number of hosts */
|
paulo@0
|
219 if (hosts >= 50)
|
paulo@0
|
220 break;
|
paulo@0
|
221 }
|
paulo@0
|
222
|
paulo@0
|
223 /* save the nodes we added to disk so we dont hit the caches again */
|
paulo@0
|
224 gt_node_list_save ();
|
paulo@0
|
225
|
paulo@0
|
226 /*
|
paulo@0
|
227 * Do an exponential backoff from the caches. If we were online and
|
paulo@0
|
228 * able to receive data, we should be getting node information
|
paulo@0
|
229 * some other way now.
|
paulo@0
|
230 */
|
paulo@0
|
231 if (hosts >= 5)
|
paulo@0
|
232 {
|
paulo@0
|
233 next_atime = now + backoff_time;
|
paulo@0
|
234 backoff_time *= 2;
|
paulo@0
|
235 }
|
paulo@0
|
236 }
|
paulo@0
|
237
|
paulo@0
|
238 static void parse_urlfile_response (HttpRequest *http_req, char *url_file)
|
paulo@0
|
239 {
|
paulo@0
|
240 int caches = 0;
|
paulo@0
|
241
|
paulo@0
|
242 if (!url_file)
|
paulo@0
|
243 {
|
paulo@0
|
244 GT->DBGFN (GT, "empty url file from %s", http_req->host);
|
paulo@0
|
245 return;
|
paulo@0
|
246 }
|
paulo@0
|
247
|
paulo@0
|
248 GT->DBGFN (GT, "urlfile from server = %s", url_file);
|
paulo@0
|
249
|
paulo@0
|
250 while (url_file && *url_file)
|
paulo@0
|
251 {
|
paulo@0
|
252 char *url;
|
paulo@0
|
253 char *host_name;
|
paulo@0
|
254 char *remote_path;
|
paulo@0
|
255
|
paulo@0
|
256 url = string_sep_set (&url_file, "\r\n");
|
paulo@0
|
257
|
paulo@0
|
258 /* skip past http:// */
|
paulo@0
|
259 string_sep (&url, "http://");
|
paulo@0
|
260
|
paulo@0
|
261 host_name = string_sep (&url, "/");
|
paulo@0
|
262 remote_path = url;
|
paulo@0
|
263
|
paulo@0
|
264 /* NOTE: remote_path is possibly empty */
|
paulo@0
|
265 if (!host_name)
|
paulo@0
|
266 continue;
|
paulo@0
|
267
|
paulo@0
|
268 url = stringf ("http://%s/%s", host_name, STRING_NOTNULL(remote_path));
|
paulo@0
|
269
|
paulo@0
|
270 /* if the webcache is already in our db, skip it */
|
paulo@0
|
271 if (file_cache_lookup (web_caches, url))
|
paulo@0
|
272 continue;
|
paulo@0
|
273
|
paulo@0
|
274 /*
|
paulo@0
|
275 * Only allow caches to register two more caches: this
|
paulo@0
|
276 * small number helps to avoid our list of caches getting
|
paulo@0
|
277 * polluted.
|
paulo@0
|
278 */
|
paulo@0
|
279 if (++caches > 2)
|
paulo@0
|
280 break;
|
paulo@0
|
281
|
paulo@0
|
282 /* format is: <url> <last time visited> */
|
paulo@0
|
283 file_cache_insert (web_caches, url, "0");
|
paulo@0
|
284 }
|
paulo@0
|
285
|
paulo@0
|
286 /* sync the pending web caches to disk */
|
paulo@0
|
287 file_cache_sync (web_caches);
|
paulo@0
|
288 }
|
paulo@0
|
289
|
paulo@0
|
290 static void end_request (HttpRequest *req, char *data)
|
paulo@0
|
291 {
|
paulo@0
|
292 char *str = req->request;
|
paulo@0
|
293
|
paulo@0
|
294 if (str && !strncmp (str, "hostfile", strlen ("hostfile")))
|
paulo@0
|
295 parse_hostfile_response (req, data);
|
paulo@0
|
296 else if (str && !strncmp (str, "urlfile", strlen ("urlfile")))
|
paulo@0
|
297 parse_urlfile_response (req, data);
|
paulo@0
|
298 else
|
paulo@0
|
299 abort ();
|
paulo@0
|
300 }
|
paulo@0
|
301
|
paulo@0
|
302 /*****************************************************************************/
|
paulo@0
|
303
|
paulo@0
|
/*
 * Intended contract: return TRUE if newname is in the same domain as
 * oldname.  For example, "new.gwc.example.com", "example.com", and
 * "cache.example.com" are all considered in the same domain as
 * "www.example.com".
 *
 * This is called on redirects, to make sure the cache can't redirect to an
 * innocent site as part of a DDoS attack.
 *
 * NOTE: as compiled, this function unconditionally returns FALSE -- the
 * matching logic below is disabled with #if 0 -- so no redirect target is
 * ever treated as being in the same domain.
 */
static BOOL in_same_domain (const char *oldname, const char *newname)
{
	return FALSE;
#if 0
	const char *p;
	const char *largest = NULL;
	int periods = 0;

	p = newname;

	/* get the largest common substring */
	while (p != NULL)
	{
		if ((largest = strstr (oldname, p)))
			break;

		/* advance to next domain part */
		p = strchr (p + 1, '.');
	}

	if (!largest)
		return FALSE;

	/*
	 * Make sure the substring matches completely to the end. This will
	 * actually fail when it shouldn't if one name includes the '.' toplevel
	 * domain and one doesn't. Oh well.
	 */
	if (strcmp (largest, p) != 0)
		return FALSE;

	/*
	 * Count the number of periods to find the number of subdomains in the
	 * largest common substring.
	 */
	for (p = largest; *p != 0; p++)
	{
		if (*p == '.')
			periods++;
	}

	/*
	 * If the last character is the root '.', subtract one, since we are
	 * looking for the number of common subdomains, and the root is shared by
	 * all names.
	 */
	if (largest[strlen (largest) - 1] == '.')
		periods--;

	/*
	 * If there are two periods, at least two toplevel domains match.
	 */
	if (periods >= 2)
		return TRUE;

	/*
	 * If there is only one period shared, the names MAY be in the same
	 * domain: one of the names has to be completely contained within the
	 * other, such as the case of "foo.example.com" and "example.com".
	 */
	if (periods == 1 &&
	    (strcmp (largest, oldname) == 0 || strcmp (largest, newname) == 0))
	{
		return TRUE;
	}

	/* not in same domain */
	return FALSE;
#endif
}
|
paulo@0
|
382
|
paulo@0
|
383 /*
|
paulo@0
|
384 * Called to when the webcache sends a 300-level response with a provided
|
paulo@0
|
385 * Location: header. Have to make sure the domain the cache directs us
|
paulo@0
|
386 * to is the same.
|
paulo@0
|
387 */
|
paulo@0
|
388 static BOOL handle_redirect (HttpRequest *req, const char *new_host,
|
paulo@0
|
389 const char *new_path)
|
paulo@0
|
390 {
|
paulo@0
|
391 assert (new_host != NULL);
|
paulo@0
|
392
|
paulo@0
|
393 if (in_same_domain (req->host, new_host) == FALSE)
|
paulo@0
|
394 return FALSE;
|
paulo@0
|
395
|
paulo@0
|
396 /* might want to do something else if the ban list later becomes per host
|
paulo@0
|
397 * rather than per URL */
|
paulo@0
|
398 ban_webcache (req, "Redirected");
|
paulo@0
|
399
|
paulo@0
|
400 GT->DBGFN (GT, "Redirecting to new webcache %s/%s", new_host, new_path);
|
paulo@0
|
401
|
paulo@0
|
402 insert_webcache (new_host, new_path, time (NULL));
|
paulo@0
|
403 file_cache_sync (web_caches);
|
paulo@0
|
404
|
paulo@0
|
405 return TRUE;
|
paulo@0
|
406 }
|
paulo@0
|
407
|
paulo@0
|
408 /*****************************************************************************/
|
paulo@0
|
409
|
paulo@0
|
410 static BOOL handle_recv (HttpRequest *req, char *data, size_t len)
|
paulo@0
|
411 {
|
paulo@0
|
412 String *s;
|
paulo@0
|
413
|
paulo@0
|
414 /* EOF */
|
paulo@0
|
415 if (!data)
|
paulo@0
|
416 {
|
paulo@0
|
417 char *str = NULL;
|
paulo@0
|
418
|
paulo@0
|
419 if ((s = req->data))
|
paulo@0
|
420 str = s->str;
|
paulo@0
|
421
|
paulo@0
|
422 GT->DBGFN (GT, "read %s from server %s", str, req->host);
|
paulo@0
|
423 end_request (req, str);
|
paulo@0
|
424
|
paulo@0
|
425 /* clear data link */
|
paulo@0
|
426 req->data = NULL;
|
paulo@0
|
427
|
paulo@0
|
428 return TRUE;
|
paulo@0
|
429 }
|
paulo@0
|
430
|
paulo@0
|
431 if (!len)
|
paulo@0
|
432 return TRUE;
|
paulo@0
|
433
|
paulo@0
|
434 GT->DBGFN (GT, "server sent us: %s", data);
|
paulo@0
|
435
|
paulo@0
|
436 if (!(s = req->data) && !(s = req->data = string_new (NULL, 0, 0, TRUE)))
|
paulo@0
|
437 return FALSE;
|
paulo@0
|
438
|
paulo@0
|
439 if (string_append (s, data) != len)
|
paulo@0
|
440 {
|
paulo@0
|
441 GT->DBGFN (GT, "string append failed");
|
paulo@0
|
442 return FALSE;
|
paulo@0
|
443 }
|
paulo@0
|
444
|
paulo@0
|
445 return TRUE;
|
paulo@0
|
446 }
|
paulo@0
|
447
|
paulo@0
|
448 /*****************************************************************************/
|
paulo@0
|
449
|
paulo@0
|
450 static BOOL handle_add_headers (HttpRequest *req, Dataset **headers)
|
paulo@0
|
451 {
|
paulo@0
|
452 /* don't let intermediaries cache our request, I think */
|
paulo@0
|
453 dataset_insertstr (headers, "Cache-Control", "no-cache");
|
paulo@0
|
454
|
paulo@0
|
455 return TRUE;
|
paulo@0
|
456 }
|
paulo@0
|
457
|
paulo@0
|
458 /*****************************************************************************/
|
paulo@0
|
459
|
paulo@0
|
460 static BOOL parse_host_and_port (char **r_host, in_port_t *r_port)
|
paulo@0
|
461 {
|
paulo@0
|
462 char *str;
|
paulo@0
|
463 char *host;
|
paulo@0
|
464 long port;
|
paulo@0
|
465
|
paulo@0
|
466 str = *r_host;
|
paulo@0
|
467
|
paulo@0
|
468 if (r_port)
|
paulo@0
|
469 *r_port = 80;
|
paulo@0
|
470
|
paulo@0
|
471 /* skip leading 'http://' if found */
|
paulo@0
|
472 if (strstr (str, "http://"))
|
paulo@0
|
473 str += strlen ("http://");
|
paulo@0
|
474
|
paulo@0
|
475 host = string_sep (&str, ":");
|
paulo@0
|
476
|
paulo@0
|
477 if (!host)
|
paulo@0
|
478 return FALSE;
|
paulo@0
|
479
|
paulo@0
|
480 *r_host = host;
|
paulo@0
|
481
|
paulo@0
|
482 if (str && !string_isempty (str))
|
paulo@0
|
483 {
|
paulo@0
|
484 port = gift_strtol (str);
|
paulo@0
|
485
|
paulo@0
|
486 /* make sure port is valid */
|
paulo@0
|
487 if (port <= 0 || port >= 65536)
|
paulo@0
|
488 return FALSE;
|
paulo@0
|
489
|
paulo@0
|
490 *r_port = port;
|
paulo@0
|
491 }
|
paulo@0
|
492
|
paulo@0
|
493 return TRUE;
|
paulo@0
|
494 }
|
paulo@0
|
495
|
paulo@0
|
496 static TCPC *open_http_connection (HttpRequest *req, const char *http_name)
|
paulo@0
|
497 {
|
paulo@0
|
498 in_addr_t ip;
|
paulo@0
|
499 in_port_t port;
|
paulo@0
|
500 char *str;
|
paulo@0
|
501 char *name;
|
paulo@0
|
502 TCPC *c;
|
paulo@0
|
503 struct hostent *host;
|
paulo@0
|
504
|
paulo@0
|
505 if (!http_name)
|
paulo@0
|
506 return NULL;
|
paulo@0
|
507
|
paulo@0
|
508 if (!(str = STRDUP (http_name)))
|
paulo@0
|
509 return NULL;
|
paulo@0
|
510
|
paulo@0
|
511 name = str;
|
paulo@0
|
512
|
paulo@0
|
513 if (!parse_host_and_port (&name, &port))
|
paulo@0
|
514 {
|
paulo@0
|
515 GT->DBGFN (GT, "error parsing hostname \"%s\"", str);
|
paulo@0
|
516 free (str);
|
paulo@0
|
517 return NULL;
|
paulo@0
|
518 }
|
paulo@0
|
519
|
paulo@0
|
520 if (!(host = gt_dns_lookup (name)))
|
paulo@0
|
521 {
|
paulo@0
|
522 free (str);
|
paulo@0
|
523 return NULL;
|
paulo@0
|
524 }
|
paulo@0
|
525
|
paulo@0
|
526 /* ip is in network-order already */
|
paulo@0
|
527 memcpy (&ip, host->h_addr, MIN (host->h_length, sizeof (ip)));
|
paulo@0
|
528
|
paulo@0
|
529 if (net_match_host (ip, "LOCAL"))
|
paulo@0
|
530 {
|
paulo@0
|
531 free (str);
|
paulo@0
|
532 ban_webcache (req, "Resolved to local IP");
|
paulo@0
|
533 return NULL;
|
paulo@0
|
534 }
|
paulo@0
|
535
|
paulo@0
|
536 c = tcp_open (ip, port, FALSE);
|
paulo@0
|
537 if (!c)
|
paulo@0
|
538 {
|
paulo@0
|
539 GT->DBGFN (GT, "couldn't open connection to %s [%s]: %s",
|
paulo@0
|
540 http_name, net_ip_str (ip), GIFT_NETERROR());
|
paulo@0
|
541 }
|
paulo@0
|
542
|
paulo@0
|
543 free (str);
|
paulo@0
|
544 return c;
|
paulo@0
|
545 }
|
paulo@0
|
546
|
paulo@0
|
547 /* return the name we have to lookup */
|
paulo@0
|
548 static char *get_http_name (char *name)
|
paulo@0
|
549 {
|
paulo@0
|
550 char *proxy;
|
paulo@0
|
551 char *host;
|
paulo@0
|
552
|
paulo@0
|
553 host = name;
|
paulo@0
|
554 proxy = HTTP_PROXY;
|
paulo@0
|
555
|
paulo@0
|
556 string_trim (proxy);
|
paulo@0
|
557
|
paulo@0
|
558 if (proxy && !string_isempty (proxy))
|
paulo@0
|
559 {
|
paulo@0
|
560 /* connect to the proxy instead */
|
paulo@0
|
561 if (STRCMP (proxy, gt_proxy_server) != 0)
|
paulo@0
|
562 {
|
paulo@0
|
563 GT->DBGFN (GT, "using proxy server %s", proxy);
|
paulo@0
|
564 free (gt_proxy_server);
|
paulo@0
|
565 gt_proxy_server = STRDUP (proxy);
|
paulo@0
|
566 }
|
paulo@0
|
567
|
paulo@0
|
568 host = proxy;
|
paulo@0
|
569 }
|
paulo@0
|
570
|
paulo@0
|
571 return host;
|
paulo@0
|
572 }
|
paulo@0
|
573
|
paulo@0
|
574 static void check_dns_error (const char *name, HttpRequest *req)
|
paulo@0
|
575 {
|
paulo@0
|
576 int error;
|
paulo@0
|
577
|
paulo@0
|
578 error = gt_dns_get_errno ();
|
paulo@0
|
579
|
paulo@0
|
580 if (!error)
|
paulo@0
|
581 return;
|
paulo@0
|
582
|
paulo@0
|
583 GT->DBGFN (GT, "lookup failed on \"%s\": %s", name, gt_dns_strerror(error));
|
paulo@0
|
584
|
paulo@0
|
585 /* ban the host, but only if not using a proxy server */
|
paulo@0
|
586 if (error == HOST_NOT_FOUND && gt_proxy_server == NULL)
|
paulo@0
|
587 {
|
paulo@0
|
588 GT->DBGFN (GT, "webcache \"%s\" not in DNS. banning", name);
|
paulo@0
|
589 ban_webcache (req, "Host not found in DNS");
|
paulo@0
|
590 return;
|
paulo@0
|
591 }
|
paulo@0
|
592 }
|
paulo@0
|
593
|
paulo@0
|
594 static BOOL make_request (char *host_name, char *remote_path, char *request)
|
paulo@0
|
595 {
|
paulo@0
|
596 HttpRequest *req;
|
paulo@0
|
597 TCPC *c;
|
paulo@0
|
598 char *resolve_name;
|
paulo@0
|
599 char *url;
|
paulo@0
|
600
|
paulo@0
|
601 url = stringf_dup ("http://%s/%s", host_name, STRING_NOTNULL(remote_path));
|
paulo@0
|
602
|
paulo@0
|
603 if (!(req = gt_http_request_new (url, request)))
|
paulo@0
|
604 {
|
paulo@0
|
605 free (url);
|
paulo@0
|
606 return FALSE;
|
paulo@0
|
607 }
|
paulo@0
|
608
|
paulo@0
|
609 free (url);
|
paulo@0
|
610
|
paulo@0
|
611 resolve_name = get_http_name (host_name);
|
paulo@0
|
612
|
paulo@0
|
613 gt_dns_set_errno (0);
|
paulo@0
|
614
|
paulo@0
|
615 if (!(c = open_http_connection (req, resolve_name)))
|
paulo@0
|
616 {
|
paulo@0
|
617 check_dns_error (resolve_name, req);
|
paulo@0
|
618 gt_http_request_close (req, -1);
|
paulo@0
|
619 return FALSE;
|
paulo@0
|
620 }
|
paulo@0
|
621
|
paulo@0
|
622 GT->DBGFN (GT, "opening connection to %s [%s]",
|
paulo@0
|
623 resolve_name, net_ip_str (c->host));
|
paulo@0
|
624
|
paulo@0
|
625 req->recv_func = handle_recv;
|
paulo@0
|
626 req->add_header_func = handle_add_headers;
|
paulo@0
|
627 req->close_req_func = handle_close_request;
|
paulo@0
|
628 req->redirect_func = handle_redirect;
|
paulo@0
|
629
|
paulo@0
|
630 gt_http_request_set_conn (req, c); /* setup references */
|
paulo@0
|
631 gt_http_request_set_proxy (req, gt_proxy_server); /* maybe use proxy */
|
paulo@0
|
632 gt_http_request_set_timeout (req, 2 * MINUTES); /* don't wait forever */
|
paulo@0
|
633 gt_http_request_set_max_len (req, 65536); /* don't read forever */
|
paulo@0
|
634
|
paulo@0
|
635 input_add (c->fd, c, INPUT_WRITE,
|
paulo@0
|
636 (InputCallback)gt_http_request_handle, TIMEOUT_DEF);
|
paulo@0
|
637
|
paulo@0
|
638 return TRUE;
|
paulo@0
|
639 }
|
paulo@0
|
640
|
paulo@0
|
641 /*****************************************************************************/
|
paulo@0
|
642
|
paulo@0
|
/*
 * State threaded through foreach_rand_cache() while one webcache is picked
 * at random from the dataset.
 */
struct find_rand_args
{
	int n;          /* selection counter: starts at 1, incremented per
	                 * candidate; a candidate is taken with probability
	                 * 1/n */
	time_t now;     /* time of the scan, compared against each entry's
	                 * access time */
	char *url;      /* copy of the currently selected key (URL), or NULL */
	char *field;    /* copy of the currently selected value, or NULL */
};
|
paulo@0
|
650
|
paulo@0
|
651 /* get a random cache from the webcaches dataset */
|
paulo@0
|
652 static void foreach_rand_cache (ds_data_t *key, ds_data_t *value,
|
paulo@0
|
653 struct find_rand_args *args)
|
paulo@0
|
654 {
|
paulo@0
|
655 time_t atime;
|
paulo@0
|
656 float range = args->n;
|
paulo@0
|
657 char *str;
|
paulo@0
|
658 char *url = key->data;
|
paulo@0
|
659 char *hostname, *path;
|
paulo@0
|
660 int ret;
|
paulo@0
|
661
|
paulo@0
|
662 if (!parse_web_cache_value (value->data, &atime))
|
paulo@0
|
663 return;
|
paulo@0
|
664
|
paulo@0
|
665 /* skip the cache entirely if we've retried too soon */
|
paulo@0
|
666 if (args->now - atime < CACHE_RETRY_TIME)
|
paulo@0
|
667 return;
|
paulo@0
|
668
|
paulo@0
|
669 /*
|
paulo@0
|
670 * Make sure the cache has a parseable url
|
paulo@0
|
671 *
|
paulo@0
|
672 * TODO: This is ugly, it really should be parsed into a
|
paulo@0
|
673 * a data structure once instead.
|
paulo@0
|
674 */
|
paulo@0
|
675 str = STRDUP (url);
|
paulo@0
|
676 ret = gt_http_url_parse (str, &hostname, &path);
|
paulo@0
|
677 free (str);
|
paulo@0
|
678
|
paulo@0
|
679 if (!ret)
|
paulo@0
|
680 {
|
paulo@0
|
681 GT->warn (GT, "bad webcache url \"%s\" from %s/gwebcaches",
|
paulo@0
|
682 key->data, gift_conf_path ("Gnutella"));
|
paulo@0
|
683 return;
|
paulo@0
|
684 }
|
paulo@0
|
685
|
paulo@0
|
686 /* decrease probability of selecting the next web cache */
|
paulo@0
|
687 args->n++;
|
paulo@0
|
688
|
paulo@0
|
689 /*
|
paulo@0
|
690 * Select this webcache with probability 1/n.
|
paulo@0
|
691 *
|
paulo@0
|
692 * Also select this cache if we haven't chosen one yet, which may be the
|
paulo@0
|
693 * case on if the index of the cache is > 0 when there are banned caches.
|
paulo@0
|
694 */
|
paulo@0
|
695 if (args->url == NULL ||
|
paulo@0
|
696 range * rand() / (RAND_MAX + 1.0) < 1.0)
|
paulo@0
|
697 {
|
paulo@0
|
698 char *keystr = key->data;
|
paulo@0
|
699 char *valuestr = value->data;
|
paulo@0
|
700
|
paulo@0
|
701 /* check if this is a bad gwebcache */
|
paulo@0
|
702 if (file_cache_lookup (bad_caches, url))
|
paulo@0
|
703 {
|
paulo@0
|
704 #if 1
|
paulo@0
|
705 GT->warn (GT, "skipping webcache %s, in bad gwebcaches", url);
|
paulo@0
|
706 #endif
|
paulo@0
|
707 /* pretend we didn't select this to ensure equal distribution */
|
paulo@0
|
708 args->n--;
|
paulo@0
|
709
|
paulo@0
|
710 return;
|
paulo@0
|
711 }
|
paulo@0
|
712
|
paulo@0
|
713 /* free the old values */
|
paulo@0
|
714 free (args->url);
|
paulo@0
|
715 free (args->field);
|
paulo@0
|
716
|
paulo@0
|
717 args->url = STRDUP (keystr);
|
paulo@0
|
718 args->field = STRDUP (valuestr);
|
paulo@0
|
719 }
|
paulo@0
|
720 }
|
paulo@0
|
721
|
paulo@0
|
722 static BOOL get_random_cache (time_t now, char **r_host_name,
|
paulo@0
|
723 char **r_remote_path)
|
paulo@0
|
724 {
|
paulo@0
|
725 int ret;
|
paulo@0
|
726 struct find_rand_args args;
|
paulo@0
|
727
|
paulo@0
|
728 args.n = 1; /* initial probability */
|
paulo@0
|
729 args.now = now; /* current time */
|
paulo@0
|
730 args.url = NULL;
|
paulo@0
|
731 args.field = NULL;
|
paulo@0
|
732
|
paulo@0
|
733 dataset_foreach (web_caches->d, DS_FOREACH(foreach_rand_cache), &args);
|
paulo@0
|
734
|
paulo@0
|
735 if (!args.url)
|
paulo@0
|
736 {
|
paulo@0
|
737 GT->DBGFN (GT, "couldn't find random cache");
|
paulo@0
|
738 return FALSE;
|
paulo@0
|
739 }
|
paulo@0
|
740
|
paulo@0
|
741 ret = gt_http_url_parse (args.url, r_host_name, r_remote_path);
|
paulo@0
|
742
|
paulo@0
|
743 if (!*r_host_name || !*r_remote_path)
|
paulo@0
|
744 {
|
paulo@0
|
745 free (args.url);
|
paulo@0
|
746 free (args.field);
|
paulo@0
|
747 return FALSE;
|
paulo@0
|
748 }
|
paulo@0
|
749
|
paulo@0
|
750 *r_host_name = STRDUP (*r_host_name);
|
paulo@0
|
751 *r_remote_path = STRDUP (*r_remote_path);
|
paulo@0
|
752
|
paulo@0
|
753 /* free the original buffer */
|
paulo@0
|
754 free (args.url);
|
paulo@0
|
755 free (args.field);
|
paulo@0
|
756
|
paulo@0
|
757 return ret;
|
paulo@0
|
758 }
|
paulo@0
|
759
|
paulo@0
|
760 static void access_gwebcaches (void)
|
paulo@0
|
761 {
|
paulo@0
|
762 int len;
|
paulo@0
|
763 char *host_name;
|
paulo@0
|
764 char *remote_path;
|
paulo@0
|
765 time_t now;
|
paulo@0
|
766 int host_requests = 0;
|
paulo@0
|
767 #if 0
|
paulo@0
|
768 int url_requests = 0;
|
paulo@0
|
769 #endif
|
paulo@0
|
770 int max_requests = 1;
|
paulo@0
|
771 BOOL ret;
|
paulo@0
|
772 BOOL need_sync;
|
paulo@0
|
773
|
paulo@0
|
774 /*
|
paulo@0
|
775 * We may get called while a check of the gwebcaches is already
|
paulo@0
|
776 * in progress.
|
paulo@0
|
777 */
|
paulo@0
|
778 if (checking_caches)
|
paulo@0
|
779 {
|
paulo@0
|
780 GT->DBGFN (GT, "Access already in progress");
|
paulo@0
|
781 return;
|
paulo@0
|
782 }
|
paulo@0
|
783
|
paulo@0
|
784 now = time (NULL);
|
paulo@0
|
785
|
paulo@0
|
786 len = dataset_length (web_caches->d);
|
paulo@0
|
787
|
paulo@0
|
788 if (max_requests > len)
|
paulo@0
|
789 max_requests = len;
|
paulo@0
|
790
|
paulo@0
|
791 need_sync = FALSE;
|
paulo@0
|
792
|
paulo@0
|
793 while (host_requests < max_requests)
|
paulo@0
|
794 {
|
paulo@0
|
795 if (!get_random_cache (now, &host_name, &remote_path))
|
paulo@0
|
796 {
|
paulo@0
|
797 GT->DBGFN (GT, "error looking up cache");
|
paulo@0
|
798 break;
|
paulo@0
|
799 }
|
paulo@0
|
800
|
paulo@0
|
801 #if 0
|
paulo@0
|
802 /* make a url request sometimes to keep the cache file up to date, but
|
paulo@0
|
803 * mostly ask for hosts */
|
paulo@0
|
804 if (10.0 * rand() / (RAND_MAX + 1.0) < 1.0)
|
paulo@0
|
805 {
|
paulo@0
|
806 ret = make_request (host_name, remote_path,
|
paulo@0
|
807 "urlfile=1&client=GIFT&version=" GT_VERSION);
|
paulo@0
|
808 url_requests++;
|
paulo@0
|
809 }
|
paulo@0
|
810 else
|
paulo@0
|
811 #endif
|
paulo@0
|
812 {
|
paulo@0
|
813 ret = make_request (host_name, remote_path,
|
paulo@0
|
814 "hostfile=1&client=GIFT&version=" GT_VERSION);
|
paulo@0
|
815
|
paulo@0
|
816 if (ret)
|
paulo@0
|
817 checking_caches = TRUE;
|
paulo@0
|
818
|
paulo@0
|
819 host_requests++;
|
paulo@0
|
820 }
|
paulo@0
|
821
|
paulo@0
|
822 if (ret)
|
paulo@0
|
823 {
|
paulo@0
|
824 GT->DBGFN (GT, "hitting web cache [total cache hits %u] "
|
paulo@0
|
825 "(cache: http://%s/%s)", cache_hits,
|
paulo@0
|
826 host_name, STRING_NOTNULL(remote_path));
|
paulo@0
|
827
|
paulo@0
|
828 cache_hits++;
|
paulo@0
|
829 need_sync = TRUE;
|
paulo@0
|
830
|
paulo@0
|
831 /* reset the atime for the cache */
|
paulo@0
|
832 insert_webcache (host_name, remote_path, now);
|
paulo@0
|
833 }
|
paulo@0
|
834
|
paulo@0
|
835 free (host_name);
|
paulo@0
|
836 free (remote_path);
|
paulo@0
|
837 }
|
paulo@0
|
838
|
paulo@0
|
839 /* only sync when we successfully accessed a cache */
|
paulo@0
|
840 if (need_sync)
|
paulo@0
|
841 file_cache_sync (web_caches);
|
paulo@0
|
842 }
|
paulo@0
|
843
|
paulo@0
|
844 static BOOL webcache_update (void *udata)
|
paulo@0
|
845 {
|
paulo@0
|
846 char *webcache_file;
|
paulo@0
|
847 int web_exists;
|
paulo@0
|
848 time_t now;
|
paulo@0
|
849 size_t nodes_len;
|
paulo@0
|
850 struct stat st;
|
paulo@0
|
851
|
paulo@0
|
852 if (GNUTELLA_LOCAL_MODE)
|
paulo@0
|
853 return TRUE;
|
paulo@0
|
854
|
paulo@0
|
855 now = time (NULL);
|
paulo@0
|
856 nodes_len = gt_conn_length (GT_NODE_NONE, GT_NODE_ANY);
|
paulo@0
|
857
|
paulo@0
|
858 /*
|
paulo@0
|
859 * If we've already accessed the caches successfully, we won't
|
paulo@0
|
860 * allow another access to go through, _unless_ the node list
|
paulo@0
|
861 * is small enough, in which case it could be we really do need
|
paulo@0
|
862 * to access the caches.
|
paulo@0
|
863 */
|
paulo@0
|
864 if (now < next_atime && nodes_len >= 20)
|
paulo@0
|
865 return FALSE;
|
paulo@0
|
866
|
paulo@0
|
867 webcache_file = STRDUP (gift_conf_path ("Gnutella/gwebcaches"));
|
paulo@0
|
868 web_exists = file_stat (webcache_file, &st);
|
paulo@0
|
869
|
paulo@0
|
870 if (!web_exists)
|
paulo@0
|
871 {
|
paulo@0
|
872 GIFT_ERROR (("gwebcaches file doesn't exist"));
|
paulo@0
|
873 return FALSE;
|
paulo@0
|
874 }
|
paulo@0
|
875
|
paulo@0
|
876 /*
|
paulo@0
|
877 * next_atime, the absolute next time we allow ourselves to contact the
|
paulo@0
|
878 * caches, gets set when we sucessfully access the caches, and if we
|
paulo@0
|
879 * manage to get some hosts from a cache we access in an exponentially
|
paulo@0
|
880 * decreasing interval.
|
paulo@0
|
881 */
|
paulo@0
|
882 access_gwebcaches ();
|
paulo@0
|
883
|
paulo@0
|
884 free (webcache_file);
|
paulo@0
|
885 return TRUE;
|
paulo@0
|
886 }
|
paulo@0
|
887
|
paulo@0
|
888 /*****************************************************************************/
|
paulo@0
|
889
|
paulo@0
|
/*
 * Public entry point: run one webcache update pass.  The result of the pass
 * is not reported to the caller.
 */
void gt_web_cache_update (void)
{
	(void) webcache_update (NULL);
}
|
paulo@0
|
894
|
paulo@0
|
895 BOOL gt_web_cache_init (void)
|
paulo@0
|
896 {
|
paulo@0
|
897 /*
|
paulo@0
|
898 * Copy the gwebcaches file to from the data dir to
|
paulo@0
|
899 * ~/.giFT/Gnutella if it is newer or if ~/.giFT/Gnutella/gwebcaches
|
paulo@0
|
900 * doesn't exist.
|
paulo@0
|
901 */
|
paulo@0
|
902 gt_config_load_file ("Gnutella/gwebcaches", TRUE, FALSE);
|
paulo@0
|
903
|
paulo@0
|
904 web_caches = file_cache_new (gift_conf_path ("Gnutella/gwebcaches"));
|
paulo@0
|
905 bad_caches = file_cache_new (gift_conf_path ("Gnutella/bad_gwebcaches"));
|
paulo@0
|
906
|
paulo@0
|
907 if (!web_caches)
|
paulo@0
|
908 return FALSE;
|
paulo@0
|
909
|
paulo@0
|
910 return TRUE;
|
paulo@0
|
911 }
|
paulo@0
|
912
|
paulo@0
|
913 void gt_web_cache_cleanup (void)
|
paulo@0
|
914 {
|
paulo@0
|
915 file_cache_free (web_caches);
|
paulo@0
|
916 web_caches = NULL;
|
paulo@0
|
917
|
paulo@0
|
918 file_cache_free (bad_caches);
|
paulo@0
|
919 bad_caches = NULL;
|
paulo@0
|
920
|
paulo@0
|
921 cache_hits = 0;
|
paulo@0
|
922 next_atime = 0;
|
paulo@0
|
923
|
paulo@0
|
924 checking_caches = FALSE;
|
paulo@0
|
925 }
|