
Source Code for Module Halberd.clues.analysis

# -*- coding: iso-8859-1 -*-

"""Utilities for clue analysis.
"""

# Copyright (C) 2004, 2005, 2006, 2010  Juan M. Bello Rivas <jmbr@superadditive.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


import copy

import Halberd.logger


logger = Halberd.logger.getLogger()


# TODO - Test fuzzy clustering and k-means against this naive hierarchical
# clustering algorithm to see which one performs better (there's a k-means
# implementation in Scipy).
# Fuzzy clustering will probably be better as it can output a degree of
# confidence which might be helpful to halberd's users.

# XXX - In python 2.4 there's itertools.groupby() which replaces the idiomatic
# dictionary uses for grouping things together.

def diff_fields(clues):
    """Study differences between fields.

    @param clues: Clues to analyze.
    @type clues: C{list}

    @return: Fields which were found to be different among the analyzed clues.
    @rtype: C{list}
    """
    def pairs(num):
        for i in xrange(num):
            for j in xrange(num):
                if i == j:
                    continue
                yield (i, j)

    import difflib

    different = []
    for i, j in pairs(len(clues)):
        one, other = clues[i].headers, clues[j].headers
        matcher = difflib.SequenceMatcher(None, one, other)

        for tag, alo, ahi, blo, bhi in matcher.get_opcodes():
            if tag == 'equal':
                continue

            for name, value in one[alo:ahi] + other[blo:bhi]:
                different.append(name)

    different.sort()
    different.reverse()

    return different

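
# --- Illustrative usage sketch (an editor's addition, not part of the
# original module). It assumes a Clue instance allows its `headers` attribute
# to be assigned directly as a list of (name, value) pairs, which is the
# shape diff_fields() above expects.
def _example_diff_fields():
    from Halberd.clues.Clue import Clue

    a, b = Clue(), Clue()
    a.headers = [('Server', 'Apache'),
                 ('Date', 'Mon, 01 Jan 2001 00:00:00 GMT')]
    b.headers = [('Server', 'Apache'),
                 ('Date', 'Mon, 01 Jan 2001 00:00:05 GMT')]

    # Only the names of fields that differ are reported: 'Date' varies
    # between the two clues, 'Server' does not.
    return diff_fields([a, b])
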
def ignore_changing_fields(clues):
    """Tries to detect and ignore MIME fields with ever-changing content.

    Some servers might include fields varying with time, randomly, etc. Those
    fields are likely to alter the clue's digest and interfere with
    L{analyze}, producing many false positives and making the scan useless.
    This function detects those fields and recalculates each clue's digest so
    they can be safely analyzed again.

    @param clues: Sequence of clues.
    @type clues: C{list} or C{tuple}
    """
    from Halberd.clues.Clue import Clue

    different = diff_fields(clues)

    # First alter Clue to be able to cope with the varying fields.
    ignored = []
    for field in different:
        method = '_get_' + Clue.normalize(field)
        if not hasattr(Clue, method):
            logger.debug('ignoring %s', field)
            ignored.append(method)
            setattr(Clue, method, lambda s, f: None)

    for clue in clues:
        Clue.parse(clue, clue.headers)

    for method in ignored:
        # We want to leave the Clue class as before because a MIME field
        # causing trouble for the current scan might be the source of precious
        # information for another scan.
        delattr(Clue, method)

    return clues

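
# --- Illustrative sketch (an editor's addition, not part of the original
# module). When every clue ends up with a different digest because of a
# volatile header (a per-response cookie, for instance), a caller can use
# ignore_changing_fields() to patch Clue and recompute the digests:
def _example_ignore_changing_fields(clues):
    # Count distinct digests before and after; a dict is used instead of a
    # set to stay close to the module's pre-2.4 idioms.
    before = len(dict([(c.info['digest'], None) for c in clues]))
    ignore_changing_fields(clues)     # patches Clue and re-parses every clue
    after = len(dict([(c.info['digest'], None) for c in clues]))
    # `after` should not exceed `before` once volatile fields are ignored.
    return before, after
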
def get_digest(clue):
    """Returns the specified clue's digest.

    This function is usually passed as a parameter for L{classify} so it can
    separate clues according to their digest (among other fields).

    @return: The digest of a clue's parsed headers.
    @rtype: C{str}
    """
    return clue.info['digest']

def clusters(clues, step=3):
    """Finds clusters of clues.

    A cluster is a group of at most C{step} clues whose time differences are
    close to each other (a cluster of C{n} clues spans at most C{n} seconds).

    @param clues: A sequence of clues to analyze.
    @type clues: C{list} or C{tuple}

    @param step: Maximum difference between the time differences of the
        cluster's clues.
    @type step: C{int}

    @return: A sequence with merged clusters.
    @rtype: C{tuple}
    """
    def iscluster(clues, num):
        """Determines if a list of clues form a cluster of the specified size.
        """
        assert len(clues) == num

        if abs(clues[0].diff - clues[-1].diff) <= num:
            return True
        return False

    def find_cluster(clues, num):
        if len(clues) >= num:
            if iscluster(clues[:num], num):
                return tuple(clues[:num])
        return ()

    clues = sort_clues(clues)

    invrange = lambda num: [(num - x) for x in range(num)]

    start = 0
    while True:
        clues = clues[start:]
        if not clues:
            break

        for i in invrange(step):
            cluster = find_cluster(clues, i)
            if cluster:
                yield cluster
                start = i
                break

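
# --- Illustrative sketch (an editor's addition, not part of the original
# module), in the spirit of the doctests used elsewhere in this file: clues
# whose time differences lie close together end up in the same cluster,
# while a clearly separated clue gets a cluster of its own.
def _example_clusters():
    from Halberd.clues.Clue import Clue

    a, b, c, outlier = Clue(), Clue(), Clue(), Clue()
    a.diff, b.diff, c.diff = 10, 11, 12
    outlier.diff = 50

    groups = list(clusters([a, b, c, outlier]))
    # Expected: two clusters, (a, b, c) grouped together and (outlier,) on
    # its own.
    return groups
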
def merge(clues):
    """Merges a sequence of clues into one.

    A new clue will store the total count of the clues.

    Note that each L{Clue} has a starting count of 1.

    >>> a, b, c = Clue(), Clue(), Clue()
    >>> sum([x.getCount() for x in [a, b, c]])
    3
    >>> a.incCount(5), b.incCount(11), c.incCount(23)
    (None, None, None)
    >>> merged = merge((a, b, c))
    >>> merged.getCount()
    42
    >>> merged == a
    True

    @param clues: A sequence containing all the clues to merge into one.
    @type clues: C{list} or C{tuple}

    @return: The result of merging all the passed clues into one.
    @rtype: L{Clue}
    """
    merged = copy.copy(clues[0])
    for clue in clues[1:]:
        merged.incCount(clue.getCount())
    return merged

def classify(seq, *classifiers):
    """Classify a sequence according to one or several criteria.

    We store each item into a nested dictionary using the classifiers as key
    generators (all of them must be callable objects).

    In the following example we classify a list of clues according to their
    digest and their time difference.

    >>> a, b, c = Clue(), Clue(), Clue()
    >>> a.diff, b.diff, c.diff = 1, 2, 2
    >>> a.info['digest'] = 'x'
    >>> b.info['digest'] = c.info['digest'] = 'y'
    >>> get_diff = lambda x: x.diff
    >>> classified = classify([a, b, c], get_digest, get_diff)
    >>> digests = classified.keys()
    >>> digests.sort()  # We sort these so doctest won't fail.
    >>> for digest in digests:
    ...     print digest
    ...     for diff in classified[digest].keys():
    ...         print ' ', diff
    ...         for clue in classified[digest][diff]:
    ...             if clue is a: print '    a'
    ...             elif clue is b: print '    b'
    ...             elif clue is c: print '    c'
    ...
    x
      1
        a
    y
      2
        b
        c

    @param seq: A sequence to classify.
    @type seq: C{list} or C{tuple}

    @param classifiers: A sequence of callables which return specific fields
        of the items contained in C{seq}.
    @type classifiers: C{list} or C{tuple}

    @return: A nested dictionary in which the keys are the fields obtained by
        applying the classifiers to the items in the specified sequence.
    @rtype: C{dict}
    """
    # XXX - Printing a dictionary in a doctest string is a very bad idea.
    classified = {}

    for item in seq:
        section = classified
        for classifier in classifiers[:-1]:
            assert callable(classifier)
            section = section.setdefault(classifier(item), {})

        # At the end no more dict nesting is needed. We simply store the
        # items.
        last = classifiers[-1]
        section.setdefault(last(item), []).append(item)

    return classified

def sections(classified, sects=None):
    """Returns sections (and their items) from a nested dict.

    See also: L{classify}

    @param classified: Nested dictionary.
    @type classified: C{dict}

    @param sects: List of results. It should not be specified by the user.
    @type sects: C{list}

    @return: A list of lists where each item is a subsection of the nested
        dictionary.
    @rtype: C{list}
    """
    if sects is None:
        sects = []

    if isinstance(classified, dict):
        for key in classified.keys():
            sections(classified[key], sects)
    elif isinstance(classified, list):
        sects.append(classified)

    return sects

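
# --- Illustrative sketch (an editor's addition, not part of the original
# module): sections() flattens the nested dictionary built by classify()
# into its leaf lists, discarding the intermediate keys.
def _example_sections():
    classified = {'x': {1: ['a']}, 'y': {2: ['b', 'c']}}
    leaves = sections(classified)
    # Expected leaves (in some order): ['a'] and ['b', 'c'].
    return leaves
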
def deltas(xs):
    """Computes the differences between the elements of a sequence of integers.

    >>> deltas([-1, 0, 1])
    [1, 1]
    >>> deltas([1, 1, 2, 3, 5, 8, 13])
    [0, 1, 1, 2, 3, 5]

    @param xs: A sequence of integers.
    @type xs: C{list}

    @return: A list of differences between consecutive elements of C{xs}.
    @rtype: C{list}
    """
    if len(xs) < 2:
        return []
    else:
        return [xs[1] - xs[0]] + deltas(xs[1:])

def slices(start, xs):
    """Returns slices of a given sequence separated by the specified indices.

    If we wanted to get the slices necessary to split range(20) in
    sub-sequences of 5 items each we'd do:

    >>> seq = range(20)
    >>> indices = [5, 10, 15]
    >>> for piece in slices(0, indices):
    ...     print seq[piece]
    [0, 1, 2, 3, 4]
    [5, 6, 7, 8, 9]
    [10, 11, 12, 13, 14]
    [15, 16, 17, 18, 19]

    @param start: Index of the first element of the sequence we want to
        partition.
    @type start: C{int}

    @param xs: Sequence of indexes where 'cuts' must be made.
    @type xs: C{list}

    @return: A sequence of C{slice} objects suitable for splitting a list as
        specified.
    @rtype: C{list} of C{slice}
    """
    if xs == []:
        # The last slice includes all the remaining items in the sequence.
        return [slice(start, None)]
    return [slice(start, xs[0])] + slices(xs[0], xs[1:])

def sort_clues(clues):
    """Sorts clues according to their time difference.
    """
    # This can be accomplished in newer (>= 2.4) Python versions using:
    # clues.sort(key=lambda x: x.diff)
    tmps = [(x.diff, x) for x in clues]
    tmps.sort()
    return [x[1] for x in tmps]

def filter_proxies(clues, maxdelta=3):
    """Detect and merge clues pointing to a proxy cache on the remote end.

    @param clues: Sequence of clues to analyze.
    @type clues: C{list}

    @param maxdelta: Maximum difference allowed between a clue's time
        difference and the previous one.
    @type maxdelta: C{int}

    @return: Sequence where all irrelevant clues pointing to proxy caches have
        been filtered out.
    @rtype: C{list}
    """
    results = []

    # Classify clues by remote time and digest.
    get_rtime = lambda c: c._remote
    classified = classify(clues, get_rtime, get_digest)

    subsections = sections(classified)
    for cur_clues in subsections:
        if len(cur_clues) == 1:
            results.append(cur_clues[0])
            continue

        cur_clues = sort_clues(cur_clues)

        diffs = [c.diff for c in cur_clues]

        # We find the indices of those clues which differ from the rest in
        # more than maxdelta seconds. A gap after element idx means the next
        # piece starts at idx + 1.
        indices = [idx + 1 for idx, delta in enumerate(deltas(diffs))
                   if abs(delta) > maxdelta]

        for piece in slices(0, indices):
            if cur_clues[piece] == []:
                break
            results.append(merge(cur_clues[piece]))

    return results

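
# --- Illustrative sketch (an editor's addition, not part of the original
# module). It assumes a Clue's _remote attribute can be assigned directly.
# Clues sharing the same remote timestamp and digest, with time differences
# within maxdelta of each other, collapse into a single merged clue.
def _example_filter_proxies():
    from Halberd.clues.Clue import Clue

    a, b, c = Clue(), Clue(), Clue()
    for clue in (a, b, c):
        clue._remote = 1000             # same remote timestamp
        clue.info['digest'] = 'x'       # same digest
    a.diff, b.diff, c.diff = 1, 2, 3

    merged = filter_proxies([a, b, c], maxdelta=3)
    # Expected: a single clue whose count is 3.
    return merged
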
def uniq(clues):
    """Return a list of unique clues.

    This is needed when merging clues coming from different sources. Clues
    with the same time difference and digest are not discarded; instead, they
    are merged into one clue with the aggregated number of hits.

    @param clues: A sequence containing the clues to analyze.
    @type clues: C{list}

    @return: Filtered sequence of clues where no clue has the same digest and
        time difference.
    @rtype: C{list}
    """
    results = []

    get_diff = lambda c: c.diff
    classified = classify(clues, get_digest, get_diff)

    for section in sections(classified):
        results.append(merge(section))

    return results

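
# --- Illustrative sketch (an editor's addition, not part of the original
# module), following the merge() doctest above: clues sharing digest and
# time difference collapse into one clue with their hit counts added up.
def _example_uniq():
    from Halberd.clues.Clue import Clue

    a, b, c = Clue(), Clue(), Clue()
    a.diff = b.diff = 5
    c.diff = 7
    a.info['digest'] = b.info['digest'] = c.info['digest'] = 'x'

    unique = uniq([a, b, c])
    # Expected: two clues, one merging a and b (count 2) and one for c.
    return unique
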
def hits(clues):
    """Compute the total number of hits in a sequence of clues.

    @param clues: Sequence of clues.
    @type clues: C{list}

    @return: Total hits.
    @rtype: C{int}
    """
    return sum([clue.getCount() for clue in clues])

def analyze(clues):
    """Draw conclusions from the clues obtained during the scanning phase.

    @param clues: Unprocessed clues obtained during the scanning stage.
    @type clues: C{list}

    @return: Coherent list of clues identifying real web servers.
    @rtype: C{list}
    """
    results = []

    clues = uniq(clues)

    clues = filter_proxies(clues)

    cluesbydigest = classify(clues, get_digest)

    for key in cluesbydigest.keys():
        for cluster in clusters(cluesbydigest[key]):
            results.append(merge(cluster))

    return results
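
# --- Illustrative sketch (an editor's addition, not part of the original
# module): the whole pipeline applied to raw clues gathered by a scanner.
# `raw_clues` is assumed to be a list of Halberd.clues.Clue.Clue objects.
def _example_analyze(raw_clues):
    servers = analyze(raw_clues)
    # Each resulting clue stands for one distinct real server behind the
    # scanned address; its count tells how many samples backed it.
    for clue in servers:
        logger.debug('digest %s backed by %d hits',
                     get_digest(clue), clue.getCount())
    return servers
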

# TODO - reanalyze should be called from this module and not from Halberd.shell.
def reanalyze(clues, analyzed, threshold):
    """Identify and ignore changing header fields.

    After the initial analysis we must check that there aren't as many real
    servers as collected clues. If there were, it could be a sign that
    something is going wrong: each clue would differ from the others because
    of one or more MIME header fields that change unexpectedly.

    @param clues: Raw sequence of clues.
    @type clues: C{list}

    @param analyzed: Result from the first analysis phase.
    @type analyzed: C{list}

    @param threshold: Minimum clue-to-realserver ratio in order to trigger
        field inspection.
    @type threshold: C{float}
    """
    def ratio():
        return len(analyzed) / float(len(clues))

    assert len(clues) > 0

    r = ratio()
    if r >= threshold:
        logger.debug('clue-to-realserver ratio is high (%.3f)', r)
        logger.debug('reanalyzing clues...')

        ignore_changing_fields(clues)
        analyzed = analyze(clues)

        logger.debug('clue reanalysis done.')

    # Check again to see if we solved the problem but only warn the user if
    # there's a significant amount of evidence.
    if ratio() >= threshold and len(clues) > 10:
        logger.warn(
'''The following results might be incorrect. It could be because the remote
host keeps changing its server version string or because halberd didn't have
enough samples.''')

    return analyzed

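
# --- Illustrative sketch (an editor's addition, not part of the original
# module), showing how a caller such as Halberd.shell might drive the second
# pass; the 0.6 threshold is an arbitrary value picked for the example.
def _example_reanalyze(raw_clues):
    analyzed = analyze(raw_clues)
    analyzed = reanalyze(raw_clues, analyzed, 0.6)
    return analyzed
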
def _test():
    import doctest

    import Halberd.clues.Clue
    import Halberd.clues.analysis

    # Due to the above imports, this test must be executed from the top level
    # source directory:
    #   python Halberd/clues/analysis.py -v

    globs = Halberd.clues.analysis.__dict__
    globs.update(Halberd.clues.Clue.__dict__)

    return doctest.testmod(m=Halberd.clues.analysis, name='analysis', globs=globs)


if __name__ == '__main__':
    _test()


# vim: ts=4 sw=4 et