From 8ea6f69a51333a961a717383fa9e646b0dc1f4c4 Mon Sep 17 00:00:00 2001 From: Ethan Blackburn Date: Mon, 7 Jan 2013 23:04:44 -0600 Subject: [PATCH 1/4] fixed content_processor fixed the content processor so it properly gives 500 words to each pool. Also, removed punctuation from words --- PyCrawler.py | 2 +- content_processor.py | 27 ++++++++++++++++++++------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/PyCrawler.py b/PyCrawler.py index b1e8b3e..b82665e 100644 --- a/PyCrawler.py +++ b/PyCrawler.py @@ -92,4 +92,4 @@ def crawl(): except Exception, e: logger.error("EXCEPTION: %s " % e) traceback.print_exc() - + \ No newline at end of file diff --git a/content_processor.py b/content_processor.py index 22ce2e3..242a1a4 100644 --- a/content_processor.py +++ b/content_processor.py @@ -1,5 +1,5 @@ from multiprocessing import Pool -import re, sys, logging +import re, sys, logging, string from ready_queue import ready_queue @@ -9,13 +9,21 @@ def rankKeywords(text): invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"] ranks = {} text = text.split(' ') + exclude = set(string.punctuation) for t in text: + #remove punctuation if attached to word + temp = t + t = '' + for i in range(len(temp)): + if(temp[i] not in exclude): + t += temp[i] + t = t.strip() if t in invalid_keywords: continue if not ranks.has_key(t): ranks[t] = 1 else: - ranks[t] += 1 + ranks[t] += 1 return ranks def stripPunctuation(text): @@ -83,13 +91,18 @@ def processBody(self): offset = 0 i = 0 l = [] - while True: + cont = True + while cont: + #this divides the text into sets of 500 words + #set j to the index of the last letter of the 500th word j = self.findnth(self.text[i:],' ',500) - offset += j + #if only 500 words or less are left if j == -1: - break - l.append(self.text[i:j]) - i = offset + j+1 + cont = False + #Should append a string that contains 500 words for each loop(except the last loop) to l + #last loop should append a string with 500 words or less to l + l.append(self.text[i:i+j]) + i += j+1 logger.debug("processing with %i threads" % len(l)) try: if len(l) == 0: From 8169ee583a1749af239e4c5830a75515e15f7027 Mon Sep 17 00:00:00 2001 From: Ethan Blackburn Date: Mon, 7 Jan 2013 23:09:29 -0600 Subject: [PATCH 2/4] Revert "fixed content_processor" This reverts commit 8ea6f69a51333a961a717383fa9e646b0dc1f4c4. --- PyCrawler.py | 2 +- content_processor.py | 27 +++++++-------------------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/PyCrawler.py b/PyCrawler.py index b82665e..b1e8b3e 100644 --- a/PyCrawler.py +++ b/PyCrawler.py @@ -92,4 +92,4 @@ def crawl(): except Exception, e: logger.error("EXCEPTION: %s " % e) traceback.print_exc() - \ No newline at end of file + diff --git a/content_processor.py b/content_processor.py index 242a1a4..22ce2e3 100644 --- a/content_processor.py +++ b/content_processor.py @@ -1,5 +1,5 @@ from multiprocessing import Pool -import re, sys, logging, string +import re, sys, logging from ready_queue import ready_queue @@ -9,21 +9,13 @@ def rankKeywords(text): invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"] ranks = {} text = text.split(' ') - exclude = set(string.punctuation) for t in text: - #remove punctuation if attached to word - temp = t - t = '' - for i in range(len(temp)): - if(temp[i] not in exclude): - t += temp[i] - t = t.strip() if t in invalid_keywords: continue if not ranks.has_key(t): ranks[t] = 1 else: - ranks[t] += 1 + ranks[t] += 1 return ranks def stripPunctuation(text): @@ -91,18 +83,13 @@ def processBody(self): offset = 0 i = 0 l = [] - cont = True - while cont: - #this divides the text into sets of 500 words - #set j to the index of the last letter of the 500th word + while True: j = self.findnth(self.text[i:],' ',500) - #if only 500 words or less are left + offset += j if j == -1: - cont = False - #Should append a string that contains 500 words for each loop(except the last loop) to l - #last loop should append a string with 500 words or less to l - l.append(self.text[i:i+j]) - i += j+1 + break + l.append(self.text[i:j]) + i = offset + j+1 logger.debug("processing with %i threads" % len(l)) try: if len(l) == 0: From 3d0dcc34862b8d2608416a36e1859420a0276665 Mon Sep 17 00:00:00 2001 From: Ethan Blackburn Date: Mon, 7 Jan 2013 23:13:51 -0600 Subject: [PATCH 3/4] fixed content processor fixed the content processor so it properly gives 500 words to each pool. Also, removed punctuation from words --- content_processor.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/content_processor.py b/content_processor.py index 22ce2e3..63ab058 100644 --- a/content_processor.py +++ b/content_processor.py @@ -1,5 +1,5 @@ from multiprocessing import Pool -import re, sys, logging +import re, sys, logging, string from ready_queue import ready_queue @@ -9,13 +9,21 @@ def rankKeywords(text): invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"] ranks = {} text = text.split(' ') + exclude = set(string.punctuation) for t in text: + #remove punctuation if attached to word + temp = t + t = '' + for i in range(len(temp)): + if(temp[i] not in exclude): + t += temp[i] + t = t.strip() if t in invalid_keywords: continue if not ranks.has_key(t): ranks[t] = 1 else: - ranks[t] += 1 + ranks[t] += 1 return ranks def stripPunctuation(text): @@ -83,13 +91,18 @@ def processBody(self): offset = 0 i = 0 l = [] - while True: + cont = True + while cont: + #this divides the text into sets of 500 words + #set j to the index of the last letter of the 500th word j = self.findnth(self.text[i:],' ',500) - offset += j + #if only 500 words or less are left if j == -1: - break - l.append(self.text[i:j]) - i = offset + j+1 + cont = False + #Should append a string that contains 500 words for each loop(except the last loop) to l + #last loop should append a string with 500 words or less to l + l.append(self.text[i:i+j]) + i += j+1 logger.debug("processing with %i threads" % len(l)) try: if len(l) == 0: @@ -136,4 +149,4 @@ def getDataDict(self): for k,v in self.keywords.items(): if v < 3: del self.keywords[k] - return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords} \ No newline at end of file + return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords} From f4e9cdd9314ce632e2ea8cab12c7db59b38945fa Mon Sep 17 00:00:00 2001 From: Cem YILDIZ Date: Mon, 18 Mar 2013 09:40:39 +0200 Subject: [PATCH 4/4] Update query.py UTF 8 Decode --- query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/query.py b/query.py index 7d079fc..064f799 100644 --- a/query.py +++ b/query.py @@ -56,7 +56,7 @@ def enqueue(self, urls): return False if len(urls) == 0: return True - args = [{'address':unicode(u)} for u in urls] + args = [{'address':u.decode("utf8")} for u in urls] result = self.connection.execute(self.queue_table.insert(), args) if result: return True @@ -81,7 +81,7 @@ def dequeue(self): return False def checkCrawled(self, url): - s = select([self.crawl_table]).where(self.crawl_table.c.address == unicode(url)) + s = select([self.crawl_table]).where(self.crawl_table.c.address == url.decode("utf8")) result = self.connection.execute(s) if len(result.fetchall()) > 0: result.close() @@ -116,4 +116,4 @@ def addPage(self, data): return True def close(self): - self.connection.close() \ No newline at end of file + self.connection.close()