Skip to content

Commit 8169ee5

Browse files
author
Ethan Blackburn
committed
Revert "fixed content_processor"
This reverts commit 8ea6f69.
1 parent 8ea6f69 commit 8169ee5

File tree

2 files changed

+8
-21
lines changed

2 files changed

+8
-21
lines changed

PyCrawler.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -92,4 +92,4 @@ def crawl():
9292
except Exception, e:
9393
logger.error("EXCEPTION: %s " % e)
9494
traceback.print_exc()
95-
95+

content_processor.py

Lines changed: 7 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
from multiprocessing import Pool
2-
import re, sys, logging, string
2+
import re, sys, logging
33

44
from ready_queue import ready_queue
55

@@ -9,21 +9,13 @@ def rankKeywords(text):
99
invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
1010
ranks = {}
1111
text = text.split(' ')
12-
exclude = set(string.punctuation)
1312
for t in text:
14-
#remove punctuation if attached to word
15-
temp = t
16-
t = ''
17-
for i in range(len(temp)):
18-
if(temp[i] not in exclude):
19-
t += temp[i]
20-
t = t.strip()
2113
if t in invalid_keywords:
2214
continue
2315
if not ranks.has_key(t):
2416
ranks[t] = 1
2517
else:
26-
ranks[t] += 1
18+
ranks[t] += 1
2719
return ranks
2820

2921
def stripPunctuation(text):
@@ -91,18 +83,13 @@ def processBody(self):
9183
offset = 0
9284
i = 0
9385
l = []
94-
cont = True
95-
while cont:
96-
#this divides the text into sets of 500 words
97-
#set j to the index of the last letter of the 500th word
86+
while True:
9887
j = self.findnth(self.text[i:],' ',500)
99-
#if only 500 words or less are left
88+
offset += j
10089
if j == -1:
101-
cont = False
102-
#Should append a string that contains 500 words for each loop(except the last loop) to l
103-
#last loop should append a string with 500 words or less to l
104-
l.append(self.text[i:i+j])
105-
i += j+1
90+
break
91+
l.append(self.text[i:j])
92+
i = offset + j+1
10693
logger.debug("processing with %i threads" % len(l))
10794
try:
10895
if len(l) == 0:

0 commit comments

Comments
 (0)