import urllib
import string
import rfc822
import cStringIO
import time


def make_xlatter():
    """Build the 256-entry translation table used to normalise text.

    Each character that appears in ``string.letters + string.digits`` is
    mapped to its upper-case form; every other character is mapped to a
    single space.  The result is suitable for ``string.translate``.
    """
    alphanumeric = string.letters + string.digits
    source_chars = []
    destination_chars = []
    for code in range(256):
        char = chr(code)
        source_chars.append(char)
        if char in alphanumeric:
            destination_chars.append(char.upper())
        else:
            destination_chars.append(' ')
    #-- Join once instead of growing two strings a character at a time.
    return string.maketrans(''.join(source_chars),
                            ''.join(destination_chars))


def simpleParser(text):
    """Count the alphanumeric tokens found in ``text``.

    The text is run through the module-level ``xlatter`` table (so
    non-alphanumerics become spaces and letters are upper-cased), split
    on spaces, and every non-empty token is counted.

    Returns a dictionary mapping each token to its number of occurrences.
    """
    #-- string.translate (rather than text.translate) is kept on purpose:
    #-- it is the call the module has always used for this step.
    filtered_text = string.translate(text, xlatter)
    symbols = {}
    for token in filtered_text.split(' '):
        if token:
            symbols[token] = symbols.get(token, 0) + 1
    return symbols


class Document:
    """Base document: a name, an attribute dictionary and a term table.

    ``attributes`` always contains a 'doctype' entry ('basic' here).
    ``terms`` is filled in by ``parse()``, which is only invoked from the
    constructor when ``data`` is truthy.
    """

    def __init__(self, name, data=None):
        self.name = name
        self.attributes = {'doctype': 'basic'}
        self.terms = {}
        self.data = data
        if self.data:
            self.parse()

    def parse(self):
        """Do nothing; subclasses override this to populate ``terms``."""
        pass


class TextDocument(Document):
    """Plain-text document whose terms are the word counts of its data."""

    #-- ``data`` defaults to None so the signature matches Document and
    #-- RFC822Document; existing callers passing data are unaffected.
    def __init__(self, name, data=None):
        Document.__init__(self, name, data)
        self.attributes['doctype'] = 'text'

    def parse(self):
        """Populate ``terms`` with the token counts of ``data``."""
        self.terms = simpleParser(self.data)


class RFC822Document(Document):
    """Mail-style document: header fields become attributes, the body
    becomes the term table."""

    def __init__(self, name, data=None):
        Document.__init__(self, name, data)
        self.attributes['doctype'] = 'rfc822'

    def parse(self):
        """Parse ``data`` as an RFC 822 message.

        Sets the 'subject', 'date', 'xdate' (YYYYMMDD), 'to' and 'from'
        attributes when the corresponding information is present, then
        counts the tokens of the message body into ``terms``.
        """
        stream = cStringIO.StringIO(self.data)
        message = rfc822.Message(stream)

        #-- Get the interesting rfc822 attributes.
        subject = message.getheader('subject')
        date = message.getheader('date')
        to_list = message.getaddrlist('to')
        sender_name, sender_address = message.getaddr('from')

        if subject:
            self.attributes['subject'] = subject

        if date:
            self.attributes['date'] = date
            #-- getdate() returns None when the header cannot be parsed,
            #-- and strftime() rejects out-of-range fields; in both cases
            #-- keep the raw 'date' and simply omit 'xdate' instead of
            #-- aborting the whole parse.
            xdate = message.getdate('date')
            if xdate is not None:
                try:
                    self.attributes['xdate'] = time.strftime('%Y%m%d', xdate)
                except ValueError:
                    pass

        if len(to_list) > 0:
            #-- Only the first recipient is recorded, as "name,address".
            to_name, to_address = to_list[0]
            self.attributes['to'] = "%s,%s" % (to_name, to_address)

        if sender_name or sender_address:
            self.attributes['from'] = "%s,%s" % (sender_name, sender_address)

        #-- Now read the body.
        message.rewindbody()
        body_text = message.fp.read()
        self.terms = simpleParser(body_text)


#-- Factory function to create a new document of the appropriate type.
def newDocument(doctype, name, data):
    """Create the document object that corresponds to ``doctype``.

    'rfc822' yields an RFC822Document, 'text' a TextDocument and 'basic'
    a plain Document, each constructed with ``name`` and ``data``.  Any
    other ``doctype`` yields None.
    """
    known_types = (
        ('rfc822', RFC822Document),
        ('text', TextDocument),
        ('basic', Document),
    )
    for type_name, document_class in known_types:
        if doctype == type_name:
            return document_class(name, data)
    #-- Unrecognised doctype: no document is created.
    return None


#-- Module-wide translation table read by simpleParser().
xlatter = make_xlatter()