diff src/gt_stats.c @ 0:d39e1d0d75b6

initial add
author paulo@hit-nxdomain.opendns.com
date Sat, 20 Feb 2010 21:18:28 -0800
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/gt_stats.c	Sat Feb 20 21:18:28 2010 -0800
     1.3 @@ -0,0 +1,303 @@
     1.4 +/*
     1.5 + * $Id: gt_stats.c,v 1.21 2005/01/04 14:59:58 mkern Exp $
     1.6 + *
     1.7 + * Copyright (C) 2001-2003 giFT project (gift.sourceforge.net)
     1.8 + *
     1.9 + * This program is free software; you can redistribute it and/or modify it
    1.10 + * under the terms of the GNU General Public License as published by the
    1.11 + * Free Software Foundation; either version 2, or (at your option) any
    1.12 + * later version.
    1.13 + *
    1.14 + * This program is distributed in the hope that it will be useful, but
    1.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
    1.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.17 + * General Public License for more details.
    1.18 + */
    1.19 +
    1.20 +#include "gt_gnutella.h"
    1.21 +
    1.22 +#include "gt_node.h"
    1.23 +#include "gt_node_list.h"
    1.24 +
    1.25 +#include "gt_search.h"
    1.26 +
    1.27 +/*****************************************************************************/
    1.28 +
    1.29 +struct gt_stats
    1.30 +{
    1.31 +	double          size_kb;
    1.32 +	unsigned long   files;
    1.33 +	size_t          users;
    1.34 +};
    1.35 +
    1.36 +/* number of stats samples to accumulate */
    1.37 +#define NR_SAMPLES         64
    1.38 +
    1.39 +/* number of samples to take surrounding the median */
    1.40 +#define NR_MEDIAN_SAMPLES  2
    1.41 +
    1.42 +/*****************************************************************************/
    1.43 +
    1.44 +/*
    1.45 + * This should be computed from our neighboring peers, but for now it
    1.46 + * hardcodes the max leaves for the most numerous nodes on the network,
    1.47 + * Limewire (32) and BearShare (10)
    1.48 + */
    1.49 +static int              avg_leaves     = (32 + 10) / 2;
    1.50 +
    1.51 +/* default ttl we use in searches */
    1.52 +static int              default_ttl    = GT_SEARCH_TTL;
    1.53 +
    1.54 +/* default outdegree for most nodes */
    1.55 +static int              default_degree = 6;
    1.56 +
    1.57 +/* keep track of the stats for the last NR_SAMPLES pongs */
    1.58 +static struct gt_stats  samples[NR_SAMPLES];
    1.59 +static size_t           samples_index;
    1.60 +static size_t           samples_count;
    1.61 +
    1.62 +/*****************************************************************************/
    1.63 +
    1.64 +void gt_stats_accumulate (in_addr_t ipv4, in_port_t port,
    1.65 +                          in_addr_t src_ip, uint32_t files,
    1.66 +                          uint32_t size_kb)
    1.67 +{
    1.68 +	samples[samples_index].files   = files;
    1.69 +	samples[samples_index].size_kb = size_kb;
    1.70 +
    1.71 +	samples_index = (samples_index + 1) % NR_SAMPLES;
    1.72 +	samples_count++;
    1.73 +
    1.74 +	if (samples_count > NR_SAMPLES)
    1.75 +		samples_count = NR_SAMPLES;
    1.76 +}
    1.77 +
    1.78 +static int stats_cmp (const void *a, const void *b)
    1.79 +{
    1.80 +	const struct gt_stats *a_s = a;
    1.81 +	const struct gt_stats *b_s = b;
    1.82 +
    1.83 +	return a_s->size_kb - b_s->size_kb;
    1.84 +}
    1.85 +
    1.86 +static void clear_stats (struct gt_stats *stats)
    1.87 +{
    1.88 +	stats->size_kb = 0.0;
    1.89 +	stats->files   = 0;
    1.90 +	stats->users   = 0;
    1.91 +}
    1.92 +
    1.93 +static void get_median_stats (struct gt_stats *pong_stats, size_t nr)
    1.94 +{
    1.95 +	int low, high;
    1.96 +	int mid;
    1.97 +	int i;
    1.98 +
    1.99 +	if (nr == 0)
   1.100 +		return;
   1.101 +
   1.102 +	mid  = nr / 2;
   1.103 +
   1.104 +	low  = mid - NR_MEDIAN_SAMPLES;
   1.105 +	high = mid + NR_MEDIAN_SAMPLES;
   1.106 +
   1.107 +	if (low < 0)
   1.108 +		low = 0;
   1.109 +
   1.110 +	if (high > nr - 1)
   1.111 +		high = nr - 1;
   1.112 +
   1.113 +	for (i = low; i <= high; i++)
   1.114 +	{
   1.115 +		pong_stats->size_kb += samples[i].size_kb;
   1.116 +		pong_stats->files   += samples[i].files;
   1.117 +		pong_stats->users++;
   1.118 +	}
   1.119 +}
   1.120 +
   1.121 +static void get_pong_stats (struct gt_stats *pong_stats)
   1.122 +{
   1.123 +	qsort (samples, samples_count, sizeof (struct gt_stats), stats_cmp);
   1.124 +
   1.125 +	clear_stats (pong_stats);
   1.126 +	get_median_stats (pong_stats, samples_count);
   1.127 +}
   1.128 +
   1.129 +/*****************************************************************************/
   1.130 +
   1.131 +static TCPC *count_stats (TCPC *c, GtNode *node, struct gt_stats *st)
   1.132 +{
   1.133 +	/* sanity check */
   1.134 +	if (node->size_kb == (uint32_t)-1 || node->files == (uint32_t)-1)
   1.135 +		return NULL;
   1.136 +
   1.137 +	st->size_kb += node->size_kb;
   1.138 +	st->files   += node->files;
   1.139 +
   1.140 +	if (node->vitality > 0)
   1.141 +		st->users++;
   1.142 +
   1.143 +	return NULL;
   1.144 +}
   1.145 +
   1.146 +static void get_conn_stats (struct gt_stats *conn_stats)
   1.147 +{
   1.148 +	clear_stats (conn_stats);
   1.149 +
   1.150 +	/* grab statistics from the nodes structures */
   1.151 +	gt_conn_foreach (GT_CONN_FOREACH(count_stats), conn_stats,
   1.152 +	                 GT_NODE_NONE, GT_NODE_ANY, 0);
   1.153 +}
   1.154 +
   1.155 +/*****************************************************************************/
   1.156 +
   1.157 +/* no need to pull in math.h for this simple thing: exponent is small */
   1.158 +static unsigned long int_pow (int base, int exponent)
   1.159 +{
   1.160 +	unsigned long total = 1;
   1.161 +
   1.162 +	while (exponent-- > 0)
   1.163 +		total *= base;
   1.164 +
   1.165 +	return total;
   1.166 +}
   1.167 +
   1.168 +static unsigned long sum_network (int degree, int ttl)
   1.169 +{
   1.170 +	int            i;
   1.171 +	unsigned long  sum;
   1.172 +
   1.173 +	if (ttl <= 0)
   1.174 +		return 0;
   1.175 +
   1.176 +	sum = degree;
   1.177 +
   1.178 +	for (i = 2; i <= ttl; i++)
   1.179 +		sum += int_pow (degree-1, i-1) * degree;
   1.180 +
   1.181 +	return sum;
   1.182 +}
   1.183 +
   1.184 +/*
   1.185 + * For each node we're connected to, check the 'X-Degree' and 'X-Max-TTL'
   1.186 + * handshake headers to use for degree and ttl in the calculations.
   1.187 + */
   1.188 +static GtNode *count_edges (TCPC *c, GtNode *node, void *udata)
   1.189 +{
   1.190 +	unsigned long *count         = udata;
   1.191 +	unsigned long  degree        = 0;
   1.192 +	unsigned long  ttl           = 0;
   1.193 +	char          *max_ttl_str;
   1.194 +	char          *x_degree_str;
   1.195 +
   1.196 +	if ((max_ttl_str = dataset_lookupstr (node->hdr, "x-max-ttl")) != NULL)
   1.197 +		ttl = ATOUL (max_ttl_str);
   1.198 +
   1.199 +	if ((x_degree_str = dataset_lookupstr (node->hdr, "x-degree")) != NULL)
   1.200 +		degree = ATOUL (x_degree_str);
   1.201 +
   1.202 +	if (degree == 0 || degree > 200)
   1.203 +		degree = default_degree;
   1.204 +
   1.205 +	if (ttl == 0 || ttl > 30)
   1.206 +		ttl = default_ttl;
   1.207 +
   1.208 +	/* workaround limewire misfeature where it appears X-Max-TTL is set
   1.209 +	 * too high */
   1.210 +	if (degree > 30 && ttl > 5)
   1.211 +		ttl = default_ttl;
   1.212 +
   1.213 +	*count += sum_network (degree, ttl);
   1.214 +	return NULL;
   1.215 +}
   1.216 +
   1.217 +/*
   1.218 + * Of course this is totally inaccurate, and not even dynamic. It
   1.219 + * approximates the maximum number of users in a network of
   1.220 + * Gnutella's structure in a given outdegree and TTL range
   1.221 + * (default_degree, default_ttl). This doesn't represent the whole
   1.222 + * network, and since the network has a significant amount of
   1.223 + * cycles this calculation is likely way too much.
   1.224 + *
   1.225 + * To compensate, we divide by 3. We could make a better guess
   1.226 + * about the number of nodes if we could find the number of cycles
   1.227 + * some way, but this information is not readily available.
   1.228 + */
   1.229 +static unsigned long guess_users (void)
   1.230 +{
   1.231 +	unsigned long users = 0;
   1.232 +
   1.233 +	gt_conn_foreach (count_edges, &users,
   1.234 +	                 GT_NODE_ULTRA, GT_NODE_CONNECTED, 0);
   1.235 +
   1.236 +	/* multiply by the number of leaves */
   1.237 +	users *= avg_leaves;
   1.238 +
   1.239 +	/* divide by average redundancy level. TODO: calculate this dynamically */
   1.240 +	users /= ((3 + 1) / 2);
   1.241 +
   1.242 +	/* divide by 3 to account for cycles */
   1.243 +	users /= 3;
   1.244 +
   1.245 +	/* multiply by 2 because, well...this whole thing is misleading anyway,
   1.246 +	 * and the total number of users is greater than the horizon */
   1.247 +	users *= 2;
   1.248 +
   1.249 +	return users;
   1.250 +}
   1.251 +
   1.252 +/*****************************************************************************/
   1.253 +
   1.254 +/*
   1.255 + * TODO: accumulate statistics on average leaves/cycles on our immediate
   1.256 + *       peers.
   1.257 + */
   1.258 +int gnutella_stats (Protocol *p, unsigned long *users, unsigned long *files,
   1.259 +                    double *size, Dataset **extra)
   1.260 +{
   1.261 +	struct gt_stats pong_stats;
   1.262 +	struct gt_stats conn_stats;
   1.263 +	struct gt_stats avg_stats;
   1.264 +	size_t          connected;
   1.265 +
   1.266 +	*files = *users = *size = 0;
   1.267 +
   1.268 +	connected = gt_conn_length (GT_NODE_ULTRA, GT_NODE_CONNECTED);
   1.269 +
   1.270 +	if (connected == 0)
   1.271 +		return 0;
   1.272 +
   1.273 +	get_pong_stats (&pong_stats);
   1.274 +	get_conn_stats (&conn_stats);
   1.275 +
   1.276 +	if (conn_stats.users == 0)
   1.277 +		conn_stats.users = 1;
   1.278 +
   1.279 +	if (pong_stats.users == 0)
   1.280 +		pong_stats.users = 1;
   1.281 +
   1.282 +	/* divide the total files size by two since it is inflated by ultrapeers
   1.283 +	 * abusing it to communicate their ultrapeer-ness to other nodes */
   1.284 +	pong_stats.size_kb /= 2;
   1.285 +	conn_stats.size_kb /= 2;
   1.286 +
   1.287 +	/* find the average of the data for our two sources */
   1.288 +	pong_stats.size_kb /= pong_stats.users;
   1.289 +	pong_stats.files   /= pong_stats.users;
   1.290 +	conn_stats.size_kb /= conn_stats.users;
   1.291 +	conn_stats.files   /= conn_stats.users;
   1.292 +
   1.293 +	/* put the stats of previously connected nodes on equal footing with the
   1.294 +	 * stats collected from pongs -- they should be more reliable */
   1.295 +	avg_stats.files   = (pong_stats.files + conn_stats.files) / 2;
   1.296 +	avg_stats.size_kb = (pong_stats.size_kb + conn_stats.size_kb) / 2;
   1.297 +
   1.298 +	/* add conn_stats.users for a little variation (not much) */
   1.299 +	avg_stats.users = guess_users () + conn_stats.users;
   1.300 +
   1.301 +	*users = avg_stats.users;
   1.302 +	*files = avg_stats.files * avg_stats.users;
   1.303 +	*size  = avg_stats.size_kb * avg_stats.users / 1024 / 1024;
   1.304 +
   1.305 +	return connected;
   1.306 +}