1
1
from multiprocessing import Pool
2
- import re , sys , logging
2
+ import re , sys , logging , string
3
3
4
4
from ready_queue import ready_queue
5
5
def rankKeywords(text):
    """Count how often each keyword occurs in *text*.

    The text is split on single spaces; every ASCII punctuation character is
    removed from each token and surrounding whitespace is trimmed. Tokens
    that end up empty or that are listed in ``invalid_keywords`` (a small
    stop-word list) are ignored.

    Args:
        text: the string to analyse.

    Returns:
        A dict mapping each remaining token to its number of occurrences.
    """
    invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
    # Built once, outside the loop, so every membership test is O(1).
    exclude = set(string.punctuation)
    ranks = {}
    for raw in text.split(' '):
        # Remove punctuation wherever it is attached to the word, then trim
        # any whitespace (e.g. newlines) left at either end.
        t = ''.join(ch for ch in raw if ch not in exclude).strip()
        if t in invalid_keywords:
            continue
        # dict.get() instead of dict.has_key(): has_key() was removed in
        # Python 3, while get() behaves the same on Python 2 and 3.
        ranks[t] = ranks.get(t, 0) + 1
    return ranks
20
28
21
29
def stripPunctuation (text ):
@@ -83,13 +91,18 @@ def processBody(self):
83
91
offset = 0
84
92
i = 0
85
93
l = []
86
- while True :
94
+ cont = True
95
+ while cont :
96
+ #this divides the text into sets of 500 words
97
+ #set j to the index of the last letter of the 500th word
87
98
j = self .findnth (self .text [i :],' ' ,500 )
88
- offset += j
99
+ #if only 500 words or less are left
89
100
if j == - 1 :
90
- break
91
- l .append (self .text [i :j ])
92
- i = offset + j + 1
101
+ cont = False
102
+ #Should append a string that contains 500 words for each loop(except the last loop) to l
103
+ #last loop should append a string with 500 words or less to l
104
+ l .append (self .text [i :i + j ])
105
+ i += j + 1
93
106
logger .debug ("processing with %i threads" % len (l ))
94
107
try :
95
108
if len (l ) == 0 :
@@ -136,4 +149,4 @@ def getDataDict(self):
136
149
for k ,v in self .keywords .items ():
137
150
if v < 3 :
138
151
del self .keywords [k ]
139
- return {"address" :self .url , "title" :self .title , "status" :self .status , "size" :self .size , "keywords" :self .keywords }
152
+ return {"address" :self .url , "title" :self .title , "status" :self .status , "size" :self .size , "keywords" :self .keywords }
0 commit comments