Skip to content

Commit 3d0dcc3

Browse files
author
Ethan Blackburn
committed
fixed content processor
Fixed the content processor so that it properly splits the text into 500-word chunks for the pool. Also removed punctuation from words.
1 parent 8169ee5 commit 3d0dcc3

File tree

1 file changed

+21
-8
lines changed

1 file changed

+21
-8
lines changed

content_processor.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from multiprocessing import Pool
2-
import re, sys, logging
2+
import re, sys, logging, string
33

44
from ready_queue import ready_queue
55

@@ -9,13 +9,21 @@ def rankKeywords(text):
99
invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
1010
ranks = {}
1111
text = text.split(' ')
12+
exclude = set(string.punctuation)
1213
for t in text:
14+
#remove punctuation if attached to word
15+
temp = t
16+
t = ''
17+
for i in range(len(temp)):
18+
if(temp[i] not in exclude):
19+
t += temp[i]
20+
t = t.strip()
1321
if t in invalid_keywords:
1422
continue
1523
if not ranks.has_key(t):
1624
ranks[t] = 1
1725
else:
18-
ranks[t] += 1
26+
ranks[t] += 1
1927
return ranks
2028

2129
def stripPunctuation(text):
@@ -83,13 +91,18 @@ def processBody(self):
8391
offset = 0
8492
i = 0
8593
l = []
86-
while True:
94+
cont = True
95+
while cont:
96+
#this divides the text into sets of 500 words
97+
#set j to the index of the last letter of the 500th word
8798
j = self.findnth(self.text[i:],' ',500)
88-
offset += j
99+
#if only 500 words or less are left
89100
if j == -1:
90-
break
91-
l.append(self.text[i:j])
92-
i = offset + j+1
101+
cont = False
102+
#Should append a string that contains 500 words for each loop(except the last loop) to l
103+
#last loop should append a string with 500 words or less to l
104+
l.append(self.text[i:i+j])
105+
i += j+1
93106
logger.debug("processing with %i threads" % len(l))
94107
try:
95108
if len(l) == 0:
@@ -136,4 +149,4 @@ def getDataDict(self):
136149
for k,v in self.keywords.items():
137150
if v < 3:
138151
del self.keywords[k]
139-
return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}
152+
return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}

0 commit comments

Comments
 (0)