#!/usr/bin/python
# Processes a file of the format:
# customer identifier \t URL
# and determines the most common sequences of 3-URLs hit
import re
# Maps customer ID to the last few URLs.
customerToURLs = {}
# Maps the URL-triplets to the number of times we've seen it.
tripletCounts = {}
# Keeps track of the best N entries.
best = []
def insertSortedToBest( key, count ):
""" Inserts a key/count pair into a list.
Uses the count (in the second slot of the tuple) to determine which
position in the list (if any) this item should go, and inserts it.
Maintains the list at 10 elements."""
pos = findPositionInBest(count)
if count == -1 :
return
old = [key, count - 1]
new = [key, count]
best.insert(pos, new)
if best.count(old) > 0:
best.remove(old)
if len(best) == 11:
best.remove( best[10] )
def findPositionInBest( count ):
""" Determines the appropriate position for an element to be inserted
into our best list."""
if len(best) == 0:
return 0
i = 0
while i < len(best) and count < best[i][1]:
i = i + 1
if i > 9:
return -1
else:
return i
def processEntry( customer, url ):
""" Entirely processes a customer/URL pair. """
if customerToURLs.has_key(customer):
sequence = customerToURLs[customer]
sequence.append(url)
# If we have enough elements already,
# remove the earliest one from the list
if len(sequence) == 4:
sequence.pop(0)
# If we have enough elements, create a triplet
# and increment or set it in the triplet counts
if len(sequence) == 3:
a, b,c = sequence
key = a + ":" + b + ":" + c
if tripletCounts.has_key(key):
tripletCounts[key] = tripletCounts[key] + 1
insertSortedToBest(key, tripletCounts[key])
else:
tripletCounts[key] = 1
insertSortedToBest(key, 1)
else:
customerToURLs[customer] = [url]
r = re.compile(r'\t');
f = open("c:\\data", "r")
for line in f:
customer, url = r.split(line)
processEntry(customer, url)
f.close()
print "====\nSolution:"
print best