1
2
3 """Utilities for clue analysis.
4 """
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 import copy
24
25 import Halberd.logger
26
27
28 logger = Halberd.logger.getLogger()
29
30
31
32
33
34
35
36
37
38
39
41 """Study differences between fields.
42
43 @param clues: Clues to analyze.
44 @type clues: C{list}
45
46 @return: Fields which were found to be different among the analyzed clues.
47 @rtype: C{list}
48 """
49 def pairs(num):
50 for i in xrange(num):
51 for j in xrange(num):
52 if i == j:
53 continue
54 yield (i, j)
55
56 import difflib
57
58 different = []
59 for i, j in pairs(len(clues)):
60 one, other = clues[i].headers, clues[j].headers
61 matcher = difflib.SequenceMatcher(None, one, other)
62
63 for tag, alo, ahi, blo, bhi in matcher.get_opcodes():
64 if tag == 'equal':
65 continue
66
67 for name, value in one[alo:ahi] + other[blo:bhi]:
68 different.append(name)
69
70 different.sort()
71 different.reverse()
72
73 return different
74
76 """Tries to detect and ignore MIME fields with ever changing content.
77
78 Some servers might include fields varying with time, randomly, etc. Those
79 fields are likely to alter the clue's digest and interfer with L{analyze},
80 producing many false positives and making the scan useless. This function
81 detects those fields and recalculates each clue's digest so they can be
82 safely analyzed again.
83
84 @param clues: Sequence of clues.
85 @type clues: C{list} or C{tuple}
86 """
87 from Halberd.clues.Clue import Clue
88
89 different = diff_fields(clues)
90
91
92 ignored = []
93 for field in different:
94 method = '_get_' + Clue.normalize(field)
95 if not hasattr(Clue, method):
96 logger.debug('ignoring %s', field)
97 ignored.append(method)
98 setattr(Clue, method, lambda s, f: None)
99
100 for clue in clues:
101 Clue.parse(clue, clue.headers)
102
103 for method in ignored:
104
105
106
107 delattr(Clue, method)
108
109 return clues
110
111
113 """Returns the specified clue's digest.
114
115 This function is usually passed as a parameter for L{classify} so it can
116 separate clues according to their digest (among other fields).
117
118 @return: The digest of a clue's parsed headers.
119 @rtype: C{str}
120 """
121 return clue.info['digest']
122
124 """Finds clusters of clues.
125
126 A cluster is a group of at most C{step} clues which only differ in 1 seconds
127 between each other.
128
129 @param clues: A sequence of clues to analyze
130 @type clues: C{list} or C{tuple}
131
132 @param step: Maximum difference between the time differences of the
133 cluster's clues.
134 @type step: C{int}
135
136 @return: A sequence with merged clusters.
137 @rtype: C{tuple}
138 """
139 def iscluster(clues, num):
140 """Determines if a list of clues form a cluster of the specified size.
141 """
142 assert len(clues) == num
143
144 if abs(clues[0].diff - clues[-1].diff) <= num:
145 return True
146 return False
147
148 def find_cluster(clues, num):
149 if len(clues) >= num:
150 if iscluster(clues[:num], num):
151 return tuple(clues[:num])
152 return ()
153
154 clues = sort_clues(clues)
155
156 invrange = lambda num: [(num - x) for x in range(num)]
157
158 start = 0
159 while True:
160 clues = clues[start:]
161 if not clues:
162 break
163
164 for i in invrange(step):
165 cluster = find_cluster(clues, i)
166 if cluster:
167 yield cluster
168 start = i
169 break
170
172 """Merges a sequence of clues into one.
173
174 A new clue will store the total count of the clues.
175
176 Note that each L{Clue} has a starting count of 1
177
178 >>> a, b, c = Clue(), Clue(), Clue()
179 >>> sum([x.getCount() for x in [a, b, c]])
180 3
181 >>> a.incCount(5), b.incCount(11), c.incCount(23)
182 (None, None, None)
183 >>> merged = merge((a, b, c))
184 >>> merged.getCount()
185 42
186 >>> merged == a
187 True
188
189 @param clues: A sequence containing all the clues to merge into one.
190 @type clues: C{list} or C{tuple}
191
192 @return: The result of merging all the passed clues into one.
193 @rtype: L{Clue}
194 """
195 merged = copy.copy(clues[0])
196 for clue in clues[1:]:
197 merged.incCount(clue.getCount())
198 return merged
199
201 """Classify a sequence according to one or several criteria.
202
203 We store each item into a nested dictionary using the classifiers as key
204 generators (all of them must be callable objects).
205
206 In the following example we classify a list of clues according to their
207 digest and their time difference.
208
209 >>> a, b, c = Clue(), Clue(), Clue()
210 >>> a.diff, b.diff, c.diff = 1, 2, 2
211 >>> a.info['digest'] = 'x'
212 >>> b.info['digest'] = c.info['digest'] = 'y'
213 >>> get_diff = lambda x: x.diff
214 >>> classified = classify([a, b, c], get_digest, get_diff)
215 >>> digests = classified.keys()
216 >>> digests.sort() # We sort these so doctest won't fail.
217 >>> for digest in digests:
218 ... print digest
219 ... for diff in classified[digest].keys():
220 ... print ' ', diff
221 ... for clue in classified[digest][diff]:
222 ... if clue is a: print ' a'
223 ... elif clue is b: print ' b'
224 ... elif clue is c: print ' c'
225 ...
226 x
227 1
228 a
229 y
230 2
231 b
232 c
233
234 @param seq: A sequence to classify.
235 @type seq: C{list} or C{tuple}
236
237 @param classifiers: A sequence of callables which return specific fields of
238 the items contained in L{seq}
239 @type classifiers: C{list} or C{tuple}
240
241 @return: A nested dictionary in which the keys are the fields obtained by
242 applying the classifiers to the items in the specified sequence.
243 @rtype: C{dict}
244 """
245
246 classified = {}
247
248 for item in seq:
249 section = classified
250 for classifier in classifiers[:-1]:
251 assert callable(classifier)
252 section = section.setdefault(classifier(item), {})
253
254
255 last = classifiers[-1]
256 section.setdefault(last(item), []).append(item)
257
258 return classified
259
261 """Returns sections (and their items) from a nested dict.
262
263 See also: L{classify}
264
265 @param classified: Nested dictionary.
266 @type classified: C{dict}
267
268 @param sects: List of results. It should not be specified by the user.
269 @type sects: C{list}
270
271 @return: A list of lists in where each item is a subsection of a nested dictionary.
272 @rtype: C{list}
273 """
274 if sects is None:
275 sects = []
276
277 if isinstance(classified, dict):
278 for key in classified.keys():
279 sections(classified[key], sects)
280 elif isinstance(classified, list):
281 sects.append(classified)
282
283 return sects
284
286 """Computes the differences between the elements of a sequence of integers.
287
288 >>> deltas([-1, 0, 1])
289 [1, 1]
290 >>> deltas([1, 1, 2, 3, 5, 8, 13])
291 [0, 1, 1, 2, 3, 5]
292
293 @param xs: A sequence of integers.
294 @type xs: C{list}
295
296 @return: A list of differences between consecutive elements of L{xs}.
297 @rtype: C{list}
298 """
299 if len(xs) < 2:
300 return []
301 else:
302 return [xs[1] - xs[0]] + deltas(xs[1:])
303
305 """Returns slices of a given sequence separated by the specified indices.
306
307 If we wanted to get the slices necessary to split range(20) in
308 sub-sequences of 5 items each we'd do:
309
310 >>> seq = range(20)
311 >>> indices = [5, 10, 15]
312 >>> for piece in slices(0, indices):
313 ... print seq[piece]
314 [0, 1, 2, 3, 4]
315 [5, 6, 7, 8, 9]
316 [10, 11, 12, 13, 14]
317 [15, 16, 17, 18, 19]
318
319 @param start: Index of the first element of the sequence we want to
320 partition.
321 @type start: C{int}.
322
323 @param xs: Sequence of indexes where 'cuts' must be made.
324 @type xs: C{list}
325
326 @return: A sequence of C{slice} objects suitable for splitting a list as
327 specified.
328 @rtype: C{list} of C{slice}
329 """
330 if xs == []:
331
332 return [slice(start, None)]
333 return [slice(start, xs[0])] + slices(xs[0], xs[1:])
334
336 """Sorts clues according to their time difference.
337 """
338
339
340 tmps = [(x.diff, x) for x in clues]
341 tmps.sort()
342 return [x[1] for x in tmps]
343
344
346 """Detect and merge clues pointing to a proxy cache on the remote end.
347
348 @param clues: Sequence of clues to analyze
349 @type clues: C{list}
350
351 @param maxdelta: Maximum difference allowed between a clue's time
352 difference and the previous one.
353 @type maxdelta: C{int}
354
355 @return: Sequence where all irrelevant clues pointing out to proxy caches
356 have been filtered out.
357 @rtype: C{list}
358 """
359 results = []
360
361
362 get_rtime = lambda c: c._remote
363 classified = classify(clues, get_rtime, get_digest)
364
365 subsections = sections(classified)
366 for cur_clues in subsections:
367 if len(cur_clues) == 1:
368 results.append(cur_clues[0])
369 continue
370
371 cur_clues = sort_clues(cur_clues)
372
373 diffs = [c.diff for c in cur_clues]
374
375
376
377 indices = [idx for idx, delta in enumerate(deltas(diffs))
378 if abs(delta) > maxdelta]
379
380 for piece in slices(0, indices):
381 if cur_clues[piece] == []:
382 break
383 results.append(merge(cur_clues[piece]))
384
385 return results
386
388 """Return a list of unique clues.
389
390 This is needed when merging clues coming from different sources. Clues with
391 the same time diff and digest are not discarded, they are merged into one
392 clue with the aggregated number of hits.
393
394 @param clues: A sequence containing the clues to analyze.
395 @type clues: C{list}
396
397 @return: Filtered sequence of clues where no clue has the same digest and
398 time difference.
399 @rtype: C{list}
400 """
401 results = []
402
403 get_diff = lambda c: c.diff
404 classified = classify(clues, get_digest, get_diff)
405
406 for section in sections(classified):
407 results.append(merge(section))
408
409 return results
410
412 """Compute the total number of hits in a sequence of clues.
413
414 @param clues: Sequence of clues.
415 @type clues: C{list}
416
417 @return: Total hits.
418 @rtype: C{int}
419 """
420 return sum([clue.getCount() for clue in clues])
421
423 """Draw conclusions from the clues obtained during the scanning phase.
424
425 @param clues: Unprocessed clues obtained during the scanning stage.
426 @type clues: C{list}
427
428 @return: Coherent list of clues identifying real web servers.
429 @rtype: C{list}
430 """
431 results = []
432
433 clues = uniq(clues)
434
435 clues = filter_proxies(clues)
436
437 cluesbydigest = classify(clues, get_digest)
438
439 for key in cluesbydigest.keys():
440 for cluster in clusters(cluesbydigest[key]):
441 results.append(merge(cluster))
442
443 return results
444
445
447 """Identify and ignore changing header fields.
448
449 After initial analysis one must check that there aren't as many realservers
450 as obtained clues. If there were it could be a sign of something wrong
451 happening: each clue is different from the others due to one or more MIME
452 header fields which change unexpectedly.
453
454 @param clues: Raw sequence of clues.
455 @type clues: C{list}
456
457 @param analyzed: Result from the first analysis phase.
458 @type analyzed: C{list}
459
460 @param threshold: Minimum clue-to-realserver ratio in order to trigger
461 field inspection.
462 @type threshold: C{float}
463 """
464 def ratio():
465 return len(analyzed) / float(len(clues))
466
467 assert len(clues) > 0
468
469 r = ratio()
470 if r >= threshold:
471 logger.debug('clue-to-realserver ratio is high (%.3f)', r)
472 logger.debug('reanalyzing clues...')
473
474 ignore_changing_fields(clues)
475 analyzed = analyze(clues)
476
477 logger.debug('clue reanalysis done.')
478
479
480
481 if ratio() >= threshold and len(clues) > 10:
482 logger.warn(
483 '''The following results might be incorrect. It could be because the remote
484 host keeps changing its server version string or because halberd didn't have
485 enough samples.''')
486
487 return analyzed
488
489
504
def _test():
    """Run the doctests embedded in this module's docstrings.

    @return: The doctest results (failure count, test count).
    """
    import doctest
    import sys

    return doctest.testmod(sys.modules[__name__])


if __name__ == '__main__':
    _test()