diff --git a/content_processor.py b/content_processor.py index 22ce2e3..63ab058 100644 --- a/content_processor.py +++ b/content_processor.py @@ -1,5 +1,5 @@ from multiprocessing import Pool -import re, sys, logging +import re, sys, logging, string from ready_queue import ready_queue @@ -9,13 +9,21 @@ def rankKeywords(text): invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"] ranks = {} text = text.split(' ') + exclude = set(string.punctuation) for t in text: + #remove punctuation if attached to word + temp = t + t = '' + for i in range(len(temp)): + if(temp[i] not in exclude): + t += temp[i] + t = t.strip() if t in invalid_keywords: continue if not ranks.has_key(t): ranks[t] = 1 else: - ranks[t] += 1 + ranks[t] += 1 return ranks def stripPunctuation(text): @@ -83,13 +91,18 @@ def processBody(self): offset = 0 i = 0 l = [] - while True: + cont = True + while cont: + #this divides the text into sets of 500 words + #set j to the index of the last letter of the 500th word j = self.findnth(self.text[i:],' ',500) - offset += j + #if only 500 words or less are left if j == -1: - break - l.append(self.text[i:j]) - i = offset + j+1 + cont = False + #Should append a string that contains 500 words for each loop(except the last loop) to l + #last loop should append a string with 500 words or less to l + l.append(self.text[i:i+j]) + i += j+1 logger.debug("processing with %i threads" % len(l)) try: if len(l) == 0: @@ -136,4 +149,4 @@ def getDataDict(self): for k,v in self.keywords.items(): if v < 3: del self.keywords[k] - return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords} \ No newline at end of file + return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords} diff --git a/query.py b/query.py index 7d079fc..064f799 100644 --- a/query.py +++ b/query.py @@ -56,7 +56,7 @@ def enqueue(self, urls): return False if len(urls) == 0: return True - args = [{'address':unicode(u)} for u in urls] + args = [{'address':u.decode("utf8")} for u in urls] result = self.connection.execute(self.queue_table.insert(), args) if result: return True @@ -81,7 +81,7 @@ def dequeue(self): return False def checkCrawled(self, url): - s = select([self.crawl_table]).where(self.crawl_table.c.address == unicode(url)) + s = select([self.crawl_table]).where(self.crawl_table.c.address == url.decode("utf8")) result = self.connection.execute(s) if len(result.fetchall()) > 0: result.close() @@ -116,4 +116,4 @@ def addPage(self, data): return True def close(self): - self.connection.close() \ No newline at end of file + self.connection.close()