diff src/gt_search_exec.c @ 0:d39e1d0d75b6

initial add
author paulo@hit-nxdomain.opendns.com
date Sat, 20 Feb 2010 21:18:28 -0800
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/gt_search_exec.c	Sat Feb 20 21:18:28 2010 -0800
     1.3 @@ -0,0 +1,366 @@
     1.4 +/*
     1.5 + * $Id: gt_search_exec.c,v 1.23 2004/01/04 05:15:28 hipnod Exp $
     1.6 + *
     1.7 + * Copyright (C) 2001-2003 giFT project (gift.sourceforge.net)
     1.8 + *
     1.9 + * This program is free software; you can redistribute it and/or modify it
    1.10 + * under the terms of the GNU General Public License as published by the
    1.11 + * Free Software Foundation; either version 2, or (at your option) any
    1.12 + * later version.
    1.13 + *
    1.14 + * This program is distributed in the hope that it will be useful, but
    1.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
    1.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.17 + * General Public License for more details.
    1.18 + */
    1.19 +
    1.20 +#include "gt_gnutella.h"
    1.21 +
    1.22 +#include "gt_search.h"
    1.23 +#include "gt_search_exec.h"
    1.24 +
    1.25 +#include "gt_query_route.h" /* QRP_DELIMITERS */
    1.26 +#include "gt_share.h"       /* gt_share_local_lookup_by_urn */
    1.27 +#include "gt_share_file.h"
    1.28 +
    1.29 +#include "sha1.h"           /* sha1_string */
    1.30 +#include "trie.h"
    1.31 +
    1.32 +#include <libgift/stopwatch.h>
    1.33 +
    1.34 +/******************************************************************************/
    1.35 +
    1.36 +#define MAX_RESULTS       200
    1.37 +
    1.38 +/******************************************************************************/
    1.39 +
    1.40 +static Trie      *gt_search_trie;
    1.41 +
    1.42 +static StopWatch *search_sw;
    1.43 +
    1.44 +/******************************************************************************/
    1.45 +
    1.46 +GtTokenSet *gt_token_set_new ()
    1.47 +{
    1.48 +	GtTokenSet *ts;
    1.49 +
    1.50 +	if (!(ts = MALLOC (sizeof (GtTokenSet))))
    1.51 +		return NULL;
    1.52 +
    1.53 +	return ts;
    1.54 +}
    1.55 +
    1.56 +void gt_token_set_free (GtTokenSet *ts)
    1.57 +{
    1.58 +	if (!ts)
    1.59 +		return;
    1.60 +
    1.61 +	free (ts->data);
    1.62 +	free (ts);
    1.63 +}
    1.64 +
    1.65 +void gt_token_set_append (GtTokenSet *ts, uint32_t token)
    1.66 +{
    1.67 +	if (ts->len >= ts->data_len)
    1.68 +	{
    1.69 +		uint32_t *new_tokens;
    1.70 +
    1.71 +		ts->data_len += 8;
    1.72 +		new_tokens = realloc (ts->data, ts->data_len * sizeof (uint32_t));
    1.73 +
    1.74 +		assert (new_tokens != NULL);
    1.75 +		ts->data = new_tokens;
    1.76 +	}
    1.77 +
    1.78 +	ts->data[ts->len++] = token;
    1.79 +}
    1.80 +
    1.81 +/******************************************************************************/
    1.82 +
    1.83 +static List *by_hash (unsigned char *hash, size_t *n)
    1.84 +{
    1.85 +	FileShare *file;
    1.86 +	char      *str;
    1.87 +	char      *urn;
    1.88 +
    1.89 +	*n = 0;
    1.90 +
    1.91 +	/* ugly that this is so sha1 specific:need a urn abstraction desperately */
    1.92 +	if (!(str = sha1_string (hash)))
    1.93 +		return NULL;
    1.94 +
    1.95 +	urn = stringf_dup ("urn:sha1:%s", str);
    1.96 +	free (str);
    1.97 +
    1.98 +	if (!(file = gt_share_local_lookup_by_urn (urn)))
    1.99 +	{
   1.100 +		free (urn);
   1.101 +		return NULL;
   1.102 +	}
   1.103 +
   1.104 +	if (LOG_RESULTS)
   1.105 +	{
   1.106 +		GT->DBGFN (GT, "Wuh-HOO! Answered a query-by-hash (%s) for (%s)",
   1.107 +		           urn, share_get_hpath (file));
   1.108 +	}
   1.109 +	
   1.110 +	*n = 1;
   1.111 +	free (urn);
   1.112 +
   1.113 +	return list_append (NULL, file);
   1.114 +}
   1.115 +
   1.116 +/******************************************************************************/
   1.117 +
   1.118 +/* some defines for list_find_custom that work like DS_BREAK, etc. */
   1.119 +#define LIST_BREAK     0
   1.120 +#define LIST_CONTINUE  1
   1.121 +
   1.122 +static int search_slowly (Share *file, void **cmp)
   1.123 +{
   1.124 +	GtTokenSet *tokens       = cmp[0];
   1.125 +	List      **results      = cmp[1];
   1.126 +	int        *max_results  = cmp[2];
   1.127 +	int        *count        = cmp[3];
   1.128 +	GtShare    *share;
   1.129 +	int         i, j;
   1.130 +	int         matched;
   1.131 +	GtTokenSet *cmp_tokens;
   1.132 +
   1.133 +	if (*count >= *max_results)
   1.134 +		return LIST_BREAK;
   1.135 +
   1.136 +	if (!(share = share_get_udata (file, GT->name)))
   1.137 +		return LIST_CONTINUE;
   1.138 +
   1.139 +	cmp_tokens = share->tokens;
   1.140 +	matched    = 0;
   1.141 +
   1.142 +	for (i = 0; i < tokens->len; i++)
   1.143 +	{
   1.144 +		int old_matched = matched;
   1.145 +
   1.146 +		for (j = 0; j < cmp_tokens->len; j++)
   1.147 +		{
   1.148 +			if (tokens->data[i] == cmp_tokens->data[j])
   1.149 +			{
   1.150 +				matched++;
   1.151 +				break;
   1.152 +			}
   1.153 +		}
   1.154 +
   1.155 +		if (matched == old_matched)
   1.156 +			break;
   1.157 +	}
   1.158 +
   1.159 +	if (matched >= tokens->len)
   1.160 +	{
   1.161 +		*results = list_prepend (*results, file);
   1.162 +		(*count)++;
   1.163 +	}
   1.164 +
   1.165 +	return LIST_CONTINUE;
   1.166 +}
   1.167 +
   1.168 +/*
   1.169 + * Find the smallest list corresponding to a word in the query
   1.170 + * that is in the trie of shares.
   1.171 + */
   1.172 +static List *find_smallest (char *query)
   1.173 +{
   1.174 +	char    *str;
   1.175 +	char    *str0;
   1.176 +	List    *smallest      = NULL;
   1.177 +	size_t   smallest_size = 0;
   1.178 +	char    *tok;
   1.179 +	size_t   size;
   1.180 +
   1.181 +	if (!(str = str0 = STRDUP (query)))
   1.182 +		return NULL;
   1.183 +
   1.184 +	string_lower (str);
   1.185 +
   1.186 +	while ((tok = string_sep_set (&str, QRP_DELIMITERS)))
   1.187 +	{
   1.188 +		List *list;
   1.189 +
   1.190 +		if (string_isempty (tok))
   1.191 +			continue;
   1.192 +
   1.193 +		if (!(list = trie_lookup (gt_search_trie, tok)))
   1.194 +		{
   1.195 +			/* no shares have this token: therefore no match */
   1.196 +			smallest      = NULL;
   1.197 +			smallest_size = 0;
   1.198 +			break;
   1.199 +		}
   1.200 +
   1.201 +		if ((size = list_length (list)) < smallest_size || smallest_size == 0)
   1.202 +		{
   1.203 +			/* keep this one */
   1.204 +			smallest_size = size;
   1.205 +			smallest      = list;
   1.206 +		}
   1.207 +	}
   1.208 +
   1.209 +	free (str0);
   1.210 +
   1.211 +	if (LOG_RESULTS)
   1.212 +		GT->DBGFN (GT, "scanning list of %d size", smallest_size);
   1.213 +
   1.214 +	return smallest;
   1.215 +}
   1.216 +
   1.217 +/*
   1.218 + * Look up the list of all shares for each word in the query in the 
   1.219 + * Trie of all local shares. Find the keyword with the smallest
   1.220 + * list, then iterate the list looking for shares.
   1.221 + */
   1.222 +static List *by_keyword (char *query, size_t max_results, size_t *n)
   1.223 +{
   1.224 +	GtTokenSet *tokens;
   1.225 +	List       *results  = NULL;
   1.226 +	List       *smallest;
   1.227 +	void       *cmp[4];
   1.228 +
   1.229 +	if (!query || string_isempty (query))
   1.230 +		return NULL;
   1.231 +
   1.232 +	if (!(tokens = gt_share_tokenize (query)))
   1.233 +		return NULL;
   1.234 +
   1.235 +	cmp[0] = tokens;
   1.236 +	cmp[1] = &results;
   1.237 +	cmp[2] = &max_results;
   1.238 +	cmp[3] = n;
   1.239 +
   1.240 +	smallest = find_smallest (query);
   1.241 +	list_find_custom (smallest, cmp, (ListForeachFunc)search_slowly);
   1.242 +
   1.243 +	gt_token_set_free (tokens);
   1.244 +	return results;
   1.245 +}
   1.246 +
   1.247 +List *gt_search_exec (char *query, gt_search_type_t type, void *extended,
   1.248 +                      uint8_t ttl, uint8_t hops)
   1.249 +{
   1.250 +	List          *results;
   1.251 +	int            max_res     = MAX_RESULTS;
   1.252 +	double         elapsed;
   1.253 +	unsigned char *hash;
   1.254 +	size_t         n           = 0;
   1.255 +
   1.256 +	stopwatch_start (search_sw);
   1.257 +
   1.258 +	hash = extended;
   1.259 +
   1.260 +	switch (type)
   1.261 +	{
   1.262 +	 case GT_SEARCH_KEYWORD: results = by_keyword (query, max_res, &n);  break;
   1.263 +	 case GT_SEARCH_HASH:    results = by_hash    (hash, &n);            break;
   1.264 +	 default: abort ();
   1.265 +	}
   1.266 +
   1.267 +	stopwatch_stop (search_sw);
   1.268 +	elapsed = stopwatch_elapsed (search_sw, NULL);
   1.269 +
   1.270 +	if (LOG_RESULTS)
   1.271 +	{
   1.272 +		GT->dbg (GT, "results: [%03d] [%d|%d] %.06fs (%s)", n,
   1.273 +		         ttl, hops, elapsed, query);
   1.274 +	}
   1.275 +
   1.276 +	return results;
   1.277 +}
   1.278 +
   1.279 +/******************************************************************************/
   1.280 +
   1.281 +static void add_share (Trie *trie, char *tok, Share *share)
   1.282 +{
   1.283 +	List *list;
   1.284 +
   1.285 +	list = trie_lookup (trie, tok);
   1.286 +
   1.287 +	/* the share may already be in the list if the filename has
   1.288 +	 * two words that are the same */
   1.289 +	if (list_find (list, share))
   1.290 +		return;
   1.291 +
   1.292 +	list = list_prepend (list, share);
   1.293 +
   1.294 +	trie_remove (trie, tok);
   1.295 +	trie_insert (trie, tok, list);
   1.296 +}
   1.297 +
   1.298 +static void del_share (Trie *trie, char *tok, Share *share)
   1.299 +{
   1.300 +	List *list;
   1.301 +
   1.302 +	list = trie_lookup (trie, tok);
   1.303 +	list = list_remove (list, share);
   1.304 +
   1.305 +	trie_remove (trie, tok);
   1.306 +
   1.307 +	if (!list)
   1.308 +		return;
   1.309 +
   1.310 +	trie_insert (trie, tok, list);
   1.311 +}
   1.312 +
   1.313 +static void search_trie_change (Trie *trie, Share *share, BOOL add)
   1.314 +{
   1.315 +	char *tok;
   1.316 +	char *str0, *str;
   1.317 +
   1.318 +	if (!(str0 = str = STRDUP (share_get_hpath (share))))
   1.319 +		return;
   1.320 +
   1.321 +	string_lower (str);
   1.322 +
   1.323 +	while ((tok = string_sep_set (&str, QRP_DELIMITERS)))
   1.324 +	{
   1.325 +		if (string_isempty (tok))
   1.326 +			continue;
   1.327 +
   1.328 +		if (add)
   1.329 +			add_share (trie, tok, share);
   1.330 +		else
   1.331 +			del_share (trie, tok, share);
   1.332 +	}
   1.333 +
   1.334 +	free (str0);
   1.335 +}
   1.336 +
   1.337 +void gt_search_exec_add (Share *share)
   1.338 +{
   1.339 +	search_trie_change (gt_search_trie, share, TRUE);
   1.340 +}
   1.341 +
   1.342 +void gt_search_exec_remove (Share *share)
   1.343 +{
   1.344 +	search_trie_change (gt_search_trie, share, FALSE);
   1.345 +}
   1.346 +
   1.347 +void gt_search_exec_sync (void)
   1.348 +{
   1.349 +#if 0
   1.350 +	printf ("SHARE TRIE:\n");
   1.351 +	trie_print (gt_search_trie);
   1.352 +#endif
   1.353 +}
   1.354 +
   1.355 +void gt_search_exec_init (void)
   1.356 +{
   1.357 +	gt_search_trie = trie_new ();
   1.358 +
   1.359 +	search_sw = stopwatch_new (FALSE);
   1.360 +}
   1.361 +
   1.362 +void gt_search_exec_cleanup (void)
   1.363 +{
   1.364 +	trie_free (gt_search_trie);
   1.365 +	gt_search_trie = NULL;
   1.366 +
   1.367 +	stopwatch_free (search_sw);
   1.368 +	search_sw = NULL;
   1.369 +}