annotate src/gt_search_exec.c @ 0:d39e1d0d75b6

initial add
author paulo@hit-nxdomain.opendns.com
date Sat, 20 Feb 2010 21:18:28 -0800
parents
children
rev   line source
paulo@0 1 /*
paulo@0 2 * $Id: gt_search_exec.c,v 1.23 2004/01/04 05:15:28 hipnod Exp $
paulo@0 3 *
paulo@0 4 * Copyright (C) 2001-2003 giFT project (gift.sourceforge.net)
paulo@0 5 *
paulo@0 6 * This program is free software; you can redistribute it and/or modify it
paulo@0 7 * under the terms of the GNU General Public License as published by the
paulo@0 8 * Free Software Foundation; either version 2, or (at your option) any
paulo@0 9 * later version.
paulo@0 10 *
paulo@0 11 * This program is distributed in the hope that it will be useful, but
paulo@0 12 * WITHOUT ANY WARRANTY; without even the implied warranty of
paulo@0 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
paulo@0 14 * General Public License for more details.
paulo@0 15 */
paulo@0 16
paulo@0 17 #include "gt_gnutella.h"
paulo@0 18
paulo@0 19 #include "gt_search.h"
paulo@0 20 #include "gt_search_exec.h"
paulo@0 21
paulo@0 22 #include "gt_query_route.h" /* QRP_DELIMITERS */
paulo@0 23 #include "gt_share.h" /* gt_share_local_lookup_by_urn */
paulo@0 24 #include "gt_share_file.h"
paulo@0 25
paulo@0 26 #include "sha1.h" /* sha1_string */
paulo@0 27 #include "trie.h"
paulo@0 28
paulo@0 29 #include <libgift/stopwatch.h>
paulo@0 30
paulo@0 31 /******************************************************************************/
paulo@0 32
paulo@0 33 #define MAX_RESULTS 200
paulo@0 34
paulo@0 35 /******************************************************************************/
paulo@0 36
paulo@0 37 static Trie *gt_search_trie;
paulo@0 38
paulo@0 39 static StopWatch *search_sw;
paulo@0 40
paulo@0 41 /******************************************************************************/
paulo@0 42
paulo@0 43 GtTokenSet *gt_token_set_new ()
paulo@0 44 {
paulo@0 45 GtTokenSet *ts;
paulo@0 46
paulo@0 47 if (!(ts = MALLOC (sizeof (GtTokenSet))))
paulo@0 48 return NULL;
paulo@0 49
paulo@0 50 return ts;
paulo@0 51 }
paulo@0 52
paulo@0 53 void gt_token_set_free (GtTokenSet *ts)
paulo@0 54 {
paulo@0 55 if (!ts)
paulo@0 56 return;
paulo@0 57
paulo@0 58 free (ts->data);
paulo@0 59 free (ts);
paulo@0 60 }
paulo@0 61
paulo@0 62 void gt_token_set_append (GtTokenSet *ts, uint32_t token)
paulo@0 63 {
paulo@0 64 if (ts->len >= ts->data_len)
paulo@0 65 {
paulo@0 66 uint32_t *new_tokens;
paulo@0 67
paulo@0 68 ts->data_len += 8;
paulo@0 69 new_tokens = realloc (ts->data, ts->data_len * sizeof (uint32_t));
paulo@0 70
paulo@0 71 assert (new_tokens != NULL);
paulo@0 72 ts->data = new_tokens;
paulo@0 73 }
paulo@0 74
paulo@0 75 ts->data[ts->len++] = token;
paulo@0 76 }
paulo@0 77
paulo@0 78 /******************************************************************************/
paulo@0 79
paulo@0 80 static List *by_hash (unsigned char *hash, size_t *n)
paulo@0 81 {
paulo@0 82 FileShare *file;
paulo@0 83 char *str;
paulo@0 84 char *urn;
paulo@0 85
paulo@0 86 *n = 0;
paulo@0 87
paulo@0 88 /* ugly that this is so sha1 specific:need a urn abstraction desperately */
paulo@0 89 if (!(str = sha1_string (hash)))
paulo@0 90 return NULL;
paulo@0 91
paulo@0 92 urn = stringf_dup ("urn:sha1:%s", str);
paulo@0 93 free (str);
paulo@0 94
paulo@0 95 if (!(file = gt_share_local_lookup_by_urn (urn)))
paulo@0 96 {
paulo@0 97 free (urn);
paulo@0 98 return NULL;
paulo@0 99 }
paulo@0 100
paulo@0 101 if (LOG_RESULTS)
paulo@0 102 {
paulo@0 103 GT->DBGFN (GT, "Wuh-HOO! Answered a query-by-hash (%s) for (%s)",
paulo@0 104 urn, share_get_hpath (file));
paulo@0 105 }
paulo@0 106
paulo@0 107 *n = 1;
paulo@0 108 free (urn);
paulo@0 109
paulo@0 110 return list_append (NULL, file);
paulo@0 111 }
paulo@0 112
paulo@0 113 /******************************************************************************/
paulo@0 114
paulo@0 115 /* some defines for list_find_custom that work like DS_BREAK, etc. */
paulo@0 116 #define LIST_BREAK 0
paulo@0 117 #define LIST_CONTINUE 1
paulo@0 118
paulo@0 119 static int search_slowly (Share *file, void **cmp)
paulo@0 120 {
paulo@0 121 GtTokenSet *tokens = cmp[0];
paulo@0 122 List **results = cmp[1];
paulo@0 123 int *max_results = cmp[2];
paulo@0 124 int *count = cmp[3];
paulo@0 125 GtShare *share;
paulo@0 126 int i, j;
paulo@0 127 int matched;
paulo@0 128 GtTokenSet *cmp_tokens;
paulo@0 129
paulo@0 130 if (*count >= *max_results)
paulo@0 131 return LIST_BREAK;
paulo@0 132
paulo@0 133 if (!(share = share_get_udata (file, GT->name)))
paulo@0 134 return LIST_CONTINUE;
paulo@0 135
paulo@0 136 cmp_tokens = share->tokens;
paulo@0 137 matched = 0;
paulo@0 138
paulo@0 139 for (i = 0; i < tokens->len; i++)
paulo@0 140 {
paulo@0 141 int old_matched = matched;
paulo@0 142
paulo@0 143 for (j = 0; j < cmp_tokens->len; j++)
paulo@0 144 {
paulo@0 145 if (tokens->data[i] == cmp_tokens->data[j])
paulo@0 146 {
paulo@0 147 matched++;
paulo@0 148 break;
paulo@0 149 }
paulo@0 150 }
paulo@0 151
paulo@0 152 if (matched == old_matched)
paulo@0 153 break;
paulo@0 154 }
paulo@0 155
paulo@0 156 if (matched >= tokens->len)
paulo@0 157 {
paulo@0 158 *results = list_prepend (*results, file);
paulo@0 159 (*count)++;
paulo@0 160 }
paulo@0 161
paulo@0 162 return LIST_CONTINUE;
paulo@0 163 }
paulo@0 164
paulo@0 165 /*
paulo@0 166 * Find the smallest list corresponding to a word in the query
paulo@0 167 * that is in the trie of shares.
paulo@0 168 */
paulo@0 169 static List *find_smallest (char *query)
paulo@0 170 {
paulo@0 171 char *str;
paulo@0 172 char *str0;
paulo@0 173 List *smallest = NULL;
paulo@0 174 size_t smallest_size = 0;
paulo@0 175 char *tok;
paulo@0 176 size_t size;
paulo@0 177
paulo@0 178 if (!(str = str0 = STRDUP (query)))
paulo@0 179 return NULL;
paulo@0 180
paulo@0 181 string_lower (str);
paulo@0 182
paulo@0 183 while ((tok = string_sep_set (&str, QRP_DELIMITERS)))
paulo@0 184 {
paulo@0 185 List *list;
paulo@0 186
paulo@0 187 if (string_isempty (tok))
paulo@0 188 continue;
paulo@0 189
paulo@0 190 if (!(list = trie_lookup (gt_search_trie, tok)))
paulo@0 191 {
paulo@0 192 /* no shares have this token: therefore no match */
paulo@0 193 smallest = NULL;
paulo@0 194 smallest_size = 0;
paulo@0 195 break;
paulo@0 196 }
paulo@0 197
paulo@0 198 if ((size = list_length (list)) < smallest_size || smallest_size == 0)
paulo@0 199 {
paulo@0 200 /* keep this one */
paulo@0 201 smallest_size = size;
paulo@0 202 smallest = list;
paulo@0 203 }
paulo@0 204 }
paulo@0 205
paulo@0 206 free (str0);
paulo@0 207
paulo@0 208 if (LOG_RESULTS)
paulo@0 209 GT->DBGFN (GT, "scanning list of %d size", smallest_size);
paulo@0 210
paulo@0 211 return smallest;
paulo@0 212 }
paulo@0 213
paulo@0 214 /*
paulo@0 215 * Look up the list of all shares for each word in the query in the
paulo@0 216 * Trie of all local shares. Find the keyword with the smallest
paulo@0 217 * list, then iterate the list looking for shares.
paulo@0 218 */
paulo@0 219 static List *by_keyword (char *query, size_t max_results, size_t *n)
paulo@0 220 {
paulo@0 221 GtTokenSet *tokens;
paulo@0 222 List *results = NULL;
paulo@0 223 List *smallest;
paulo@0 224 void *cmp[4];
paulo@0 225
paulo@0 226 if (!query || string_isempty (query))
paulo@0 227 return NULL;
paulo@0 228
paulo@0 229 if (!(tokens = gt_share_tokenize (query)))
paulo@0 230 return NULL;
paulo@0 231
paulo@0 232 cmp[0] = tokens;
paulo@0 233 cmp[1] = &results;
paulo@0 234 cmp[2] = &max_results;
paulo@0 235 cmp[3] = n;
paulo@0 236
paulo@0 237 smallest = find_smallest (query);
paulo@0 238 list_find_custom (smallest, cmp, (ListForeachFunc)search_slowly);
paulo@0 239
paulo@0 240 gt_token_set_free (tokens);
paulo@0 241 return results;
paulo@0 242 }
paulo@0 243
paulo@0 244 List *gt_search_exec (char *query, gt_search_type_t type, void *extended,
paulo@0 245 uint8_t ttl, uint8_t hops)
paulo@0 246 {
paulo@0 247 List *results;
paulo@0 248 int max_res = MAX_RESULTS;
paulo@0 249 double elapsed;
paulo@0 250 unsigned char *hash;
paulo@0 251 size_t n = 0;
paulo@0 252
paulo@0 253 stopwatch_start (search_sw);
paulo@0 254
paulo@0 255 hash = extended;
paulo@0 256
paulo@0 257 switch (type)
paulo@0 258 {
paulo@0 259 case GT_SEARCH_KEYWORD: results = by_keyword (query, max_res, &n); break;
paulo@0 260 case GT_SEARCH_HASH: results = by_hash (hash, &n); break;
paulo@0 261 default: abort ();
paulo@0 262 }
paulo@0 263
paulo@0 264 stopwatch_stop (search_sw);
paulo@0 265 elapsed = stopwatch_elapsed (search_sw, NULL);
paulo@0 266
paulo@0 267 if (LOG_RESULTS)
paulo@0 268 {
paulo@0 269 GT->dbg (GT, "results: [%03d] [%d|%d] %.06fs (%s)", n,
paulo@0 270 ttl, hops, elapsed, query);
paulo@0 271 }
paulo@0 272
paulo@0 273 return results;
paulo@0 274 }
paulo@0 275
paulo@0 276 /******************************************************************************/
paulo@0 277
paulo@0 278 static void add_share (Trie *trie, char *tok, Share *share)
paulo@0 279 {
paulo@0 280 List *list;
paulo@0 281
paulo@0 282 list = trie_lookup (trie, tok);
paulo@0 283
paulo@0 284 /* the share may already be in the list if the filename has
paulo@0 285 * two words that are the same */
paulo@0 286 if (list_find (list, share))
paulo@0 287 return;
paulo@0 288
paulo@0 289 list = list_prepend (list, share);
paulo@0 290
paulo@0 291 trie_remove (trie, tok);
paulo@0 292 trie_insert (trie, tok, list);
paulo@0 293 }
paulo@0 294
paulo@0 295 static void del_share (Trie *trie, char *tok, Share *share)
paulo@0 296 {
paulo@0 297 List *list;
paulo@0 298
paulo@0 299 list = trie_lookup (trie, tok);
paulo@0 300 list = list_remove (list, share);
paulo@0 301
paulo@0 302 trie_remove (trie, tok);
paulo@0 303
paulo@0 304 if (!list)
paulo@0 305 return;
paulo@0 306
paulo@0 307 trie_insert (trie, tok, list);
paulo@0 308 }
paulo@0 309
paulo@0 310 static void search_trie_change (Trie *trie, Share *share, BOOL add)
paulo@0 311 {
paulo@0 312 char *tok;
paulo@0 313 char *str0, *str;
paulo@0 314
paulo@0 315 if (!(str0 = str = STRDUP (share_get_hpath (share))))
paulo@0 316 return;
paulo@0 317
paulo@0 318 string_lower (str);
paulo@0 319
paulo@0 320 while ((tok = string_sep_set (&str, QRP_DELIMITERS)))
paulo@0 321 {
paulo@0 322 if (string_isempty (tok))
paulo@0 323 continue;
paulo@0 324
paulo@0 325 if (add)
paulo@0 326 add_share (trie, tok, share);
paulo@0 327 else
paulo@0 328 del_share (trie, tok, share);
paulo@0 329 }
paulo@0 330
paulo@0 331 free (str0);
paulo@0 332 }
paulo@0 333
paulo@0 334 void gt_search_exec_add (Share *share)
paulo@0 335 {
paulo@0 336 search_trie_change (gt_search_trie, share, TRUE);
paulo@0 337 }
paulo@0 338
paulo@0 339 void gt_search_exec_remove (Share *share)
paulo@0 340 {
paulo@0 341 search_trie_change (gt_search_trie, share, FALSE);
paulo@0 342 }
paulo@0 343
paulo@0 344 void gt_search_exec_sync (void)
paulo@0 345 {
paulo@0 346 #if 0
paulo@0 347 printf ("SHARE TRIE:\n");
paulo@0 348 trie_print (gt_search_trie);
paulo@0 349 #endif
paulo@0 350 }
paulo@0 351
paulo@0 352 void gt_search_exec_init (void)
paulo@0 353 {
paulo@0 354 gt_search_trie = trie_new ();
paulo@0 355
paulo@0 356 search_sw = stopwatch_new (FALSE);
paulo@0 357 }
paulo@0 358
paulo@0 359 void gt_search_exec_cleanup (void)
paulo@0 360 {
paulo@0 361 trie_free (gt_search_trie);
paulo@0 362 gt_search_trie = NULL;
paulo@0 363
paulo@0 364 stopwatch_free (search_sw);
paulo@0 365 search_sw = NULL;
paulo@0 366 }