From 8ea6f69a51333a961a717383fa9e646b0dc1f4c4 Mon Sep 17 00:00:00 2001
From: Ethan Blackburn <eblackb1@slu.edu>
Date: Mon, 7 Jan 2013 23:04:44 -0600
Subject: [PATCH 1/4] fixed content_processor

Fixed the content processor so that it properly gives 500 words to each
pool. Also removed punctuation attached to words in rankKeywords.
---
 PyCrawler.py         |  2 +-
 content_processor.py | 27 ++++++++++++++++++++-------
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/PyCrawler.py b/PyCrawler.py
index b1e8b3e..b82665e 100644
--- a/PyCrawler.py
+++ b/PyCrawler.py
@@ -92,4 +92,4 @@ def crawl():
 	except Exception, e:
 		logger.error("EXCEPTION: %s " % e)
 		traceback.print_exc()
-	
+	
\ No newline at end of file
diff --git a/content_processor.py b/content_processor.py
index 22ce2e3..242a1a4 100644
--- a/content_processor.py
+++ b/content_processor.py
@@ -1,5 +1,5 @@
 from multiprocessing import Pool
-import re, sys, logging
+import re, sys, logging, string
 
 from ready_queue import ready_queue
 
@@ -9,13 +9,21 @@ def rankKeywords(text):
 	invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
 	ranks = {}
 	text = text.split(' ')
+	exclude = set(string.punctuation)
 	for t in text:
+		#remove punctuation if attached to word
+		temp = t
+		t = ''
+		for i in range(len(temp)):
+			if(temp[i] not in exclude):
+				t += temp[i]
+		t = t.strip()
 		if t in invalid_keywords:
 			continue
 		if not ranks.has_key(t):
 			ranks[t] = 1
 		else:
-			ranks[t] += 1
+			ranks[t] += 1 
 	return ranks
 
 def stripPunctuation(text):
@@ -83,13 +91,18 @@ def processBody(self):
 			offset = 0
 			i = 0
 			l = []
-			while True:
+			cont = True
+			while cont:
+				#this divides the text into sets of 500 words
+				#set j to the index of the last letter of the 500th word
 				j = self.findnth(self.text[i:],' ',500)
-				offset += j
+				#if only 500 words or less are left
 				if j == -1:
-					break
-				l.append(self.text[i:j])
-				i = offset + j+1
+					cont = False
+				#Should append a string that contains 500 words for each loop(except the last loop) to l
+				#last loop should append a string with 500 words or less to l
+				l.append(self.text[i:i+j])
+				i += j+1
 			logger.debug("processing with %i threads" % len(l))
 			try:
 				if len(l) == 0:

From 8169ee583a1749af239e4c5830a75515e15f7027 Mon Sep 17 00:00:00 2001
From: Ethan Blackburn <eblackb1@slu.edu>
Date: Mon, 7 Jan 2013 23:09:29 -0600
Subject: [PATCH 2/4] Revert "fixed content_processor"

This reverts commit 8ea6f69a51333a961a717383fa9e646b0dc1f4c4.
---
 PyCrawler.py         |  2 +-
 content_processor.py | 27 +++++++--------------------
 2 files changed, 8 insertions(+), 21 deletions(-)

diff --git a/PyCrawler.py b/PyCrawler.py
index b82665e..b1e8b3e 100644
--- a/PyCrawler.py
+++ b/PyCrawler.py
@@ -92,4 +92,4 @@ def crawl():
 	except Exception, e:
 		logger.error("EXCEPTION: %s " % e)
 		traceback.print_exc()
-	
\ No newline at end of file
+	
diff --git a/content_processor.py b/content_processor.py
index 242a1a4..22ce2e3 100644
--- a/content_processor.py
+++ b/content_processor.py
@@ -1,5 +1,5 @@
 from multiprocessing import Pool
-import re, sys, logging, string
+import re, sys, logging
 
 from ready_queue import ready_queue
 
@@ -9,21 +9,13 @@ def rankKeywords(text):
 	invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
 	ranks = {}
 	text = text.split(' ')
-	exclude = set(string.punctuation)
 	for t in text:
-		#remove punctuation if attached to word
-		temp = t
-		t = ''
-		for i in range(len(temp)):
-			if(temp[i] not in exclude):
-				t += temp[i]
-		t = t.strip()
 		if t in invalid_keywords:
 			continue
 		if not ranks.has_key(t):
 			ranks[t] = 1
 		else:
-			ranks[t] += 1 
+			ranks[t] += 1
 	return ranks
 
 def stripPunctuation(text):
@@ -91,18 +83,13 @@ def processBody(self):
 			offset = 0
 			i = 0
 			l = []
-			cont = True
-			while cont:
-				#this divides the text into sets of 500 words
-				#set j to the index of the last letter of the 500th word
+			while True:
 				j = self.findnth(self.text[i:],' ',500)
-				#if only 500 words or less are left
+				offset += j
 				if j == -1:
-					cont = False
-				#Should append a string that contains 500 words for each loop(except the last loop) to l
-				#last loop should append a string with 500 words or less to l
-				l.append(self.text[i:i+j])
-				i += j+1
+					break
+				l.append(self.text[i:j])
+				i = offset + j+1
 			logger.debug("processing with %i threads" % len(l))
 			try:
 				if len(l) == 0:

From 3d0dcc34862b8d2608416a36e1859420a0276665 Mon Sep 17 00:00:00 2001
From: Ethan Blackburn <eblackb1@slu.edu>
Date: Mon, 7 Jan 2013 23:13:51 -0600
Subject: [PATCH 3/4] fixed content processor

Fixed the content processor so that it properly gives 500 words to each
pool. Also removed punctuation attached to words in rankKeywords.
---
 content_processor.py | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/content_processor.py b/content_processor.py
index 22ce2e3..63ab058 100644
--- a/content_processor.py
+++ b/content_processor.py
@@ -1,5 +1,5 @@
 from multiprocessing import Pool
-import re, sys, logging
+import re, sys, logging, string
 
 from ready_queue import ready_queue
 
@@ -9,13 +9,21 @@ def rankKeywords(text):
 	invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
 	ranks = {}
 	text = text.split(' ')
+	exclude = set(string.punctuation)
 	for t in text:
+		#remove punctuation if attached to word
+		temp = t
+		t = ''
+		for i in range(len(temp)):
+			if(temp[i] not in exclude):
+				t += temp[i]
+		t = t.strip()
 		if t in invalid_keywords:
 			continue
 		if not ranks.has_key(t):
 			ranks[t] = 1
 		else:
-			ranks[t] += 1
+			ranks[t] += 1 
 	return ranks
 
 def stripPunctuation(text):
@@ -83,13 +91,18 @@ def processBody(self):
 			offset = 0
 			i = 0
 			l = []
-			while True:
+			cont = True
+			while cont:
+				#this divides the text into sets of 500 words
+				#set j to the index of the last letter of the 500th word
 				j = self.findnth(self.text[i:],' ',500)
-				offset += j
+				#if only 500 words or less are left
 				if j == -1:
-					break
-				l.append(self.text[i:j])
-				i = offset + j+1
+					cont = False
+				#Should append a string that contains 500 words for each loop(except the last loop) to l
+				#last loop should append a string with 500 words or less to l
+				l.append(self.text[i:i+j])
+				i += j+1
 			logger.debug("processing with %i threads" % len(l))
 			try:
 				if len(l) == 0:
@@ -136,4 +149,4 @@ def getDataDict(self):
 		for k,v in self.keywords.items():
 			if v < 3:
 				del self.keywords[k]
-		return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}
\ No newline at end of file
+		return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}

From f4e9cdd9314ce632e2ea8cab12c7db59b38945fa Mon Sep 17 00:00:00 2001
From: Cem YILDIZ <cemyildiz@estonya.net>
Date: Mon, 18 Mar 2013 09:40:39 +0200
Subject: [PATCH 4/4] Update query.py

Decode URLs as UTF-8 with .decode("utf8") instead of calling unicode().
---
 query.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/query.py b/query.py
index 7d079fc..064f799 100644
--- a/query.py
+++ b/query.py
@@ -56,7 +56,7 @@ def enqueue(self, urls):
 			return False
 		if len(urls) == 0:
 			return True
-		args = [{'address':unicode(u)} for u in urls]
+		args = [{'address':u.decode("utf8")} for u in urls]
 		result = self.connection.execute(self.queue_table.insert(), args)
 		if result:
 			return True
@@ -81,7 +81,7 @@ def dequeue(self):
 		return False
 	
 	def checkCrawled(self, url):
-		s =  select([self.crawl_table]).where(self.crawl_table.c.address == unicode(url))
+		s =  select([self.crawl_table]).where(self.crawl_table.c.address == url.decode("utf8"))
 		result = self.connection.execute(s)
 		if len(result.fetchall()) > 0:
 			result.close()
@@ -116,4 +116,4 @@ def addPage(self, data):
 		return True
 
 	def close(self):
-		self.connection.close()
\ No newline at end of file
+		self.connection.close()