Every line of the 'nltk stopwords' code snippets below is scanned for vulnerabilities by our machine learning engine, which combs millions of open source libraries to help keep your Python code secure.
from nltk.corpus import stopwords

def stopwords_nltk():
    # NLTK's built-in English stop word list (requires the 'stopwords' corpus)
    return stopwords.words('english')
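The corpus has to be downloaded once before the call above works; a minimal usage sketch:

import nltk

nltk.download('stopwords')       # one-time corpus download
print(stopwords_nltk()[:5])      # e.g. ['i', 'me', 'my', 'myself', 'we']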
def remove_stopwords(tokens, language):
    """
    Remove Chinese stop words.
    """
    from .stopwords import stopwords

    # Drop stop words via set difference (note that the `language`
    # argument is unused, and token order and duplicates are discarded)
    tokens = set(tokens) - set(stopwords)

    return tokens
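Because the result is a set, token order and duplicate counts are lost. A tiny sketch of that behavior with a stand-in stop word list (the real one comes from the module's .stopwords import):

stopwords = {"的", "了"}               # stand-in stop word list
tokens = ["我", "的", "书", "的"]
print(set(tokens) - set(stopwords))   # {'我', '书'}: order and counts are gone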
def stop_words_stem(self, stop_words=None):
    # Stem the given stop words (or fall back to the instance's list)
    # and deduplicate the results; `stem` must be provided elsewhere.
    if stop_words is not None:
        stop_words_ = stop_words
    else:
        stop_words_ = self.stop_words
    return list({stem(word) for word in stop_words_})
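The snippet never shows where `stem` comes from; one plausible stand-in is the Porter2 stemmer from the `stemming` package (an assumption, any str-to-str stemmer fits):

from stemming.porter2 import stem   # assumed source; not shown in the snippet

print(sorted({stem(w) for w in ["running", "runs", "ran"]}))
# ['ran', 'run']: stemming collapses inflected variants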
from gensim.parsing.preprocessing import STOPWORDS

def remove_stopwords(tokens, stopwords=STOPWORDS):
    """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS`.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    stopwords : iterable of str, optional
        Sequence of stopwords.

    Returns
    -------
    list of str
        List of tokens without `stopwords`.

    """
    return [token for token in tokens if token not in stopwords]
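A quick usage sketch; gensim's STOPWORDS is a frozenset, so the membership tests are O(1):

print(remove_stopwords(["the", "quick", "brown", "fox"]))
# ['quick', 'brown', 'fox']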
def _remove_stop_words(phrase):
    # Trim stop words from the front of the phrase...
    while len(phrase) > 0 and (phrase[0].is_stop
            or str(phrase[0]).strip().lower() in Stopwords.get_words()):
        phrase = phrase[1:]
    # ...and from the back, leaving interior stop words in place
    while len(phrase) > 0 and (phrase[-1].is_stop
            or str(phrase[-1]).strip().lower() in Stopwords.get_words()):
        phrase = phrase[:-1]
    return phrase
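The tokens are expected to carry a spaCy-style .is_stop attribute, and the project supplies a Stopwords helper that isn't shown. A self-contained sketch with stand-ins for both:

class Tok:
    # stand-in for a spaCy token
    def __init__(self, text, is_stop=False):
        self.text, self.is_stop = text, is_stop
    def __str__(self):
        return self.text

class Stopwords:
    # stand-in for the project's Stopwords helper
    @staticmethod
    def get_words():
        return {"the", "of"}

phrase = [Tok("the", True), Tok("rate"), Tok("of", True), Tok("change"), Tok("of", True)]
print([str(t) for t in _remove_stop_words(phrase)])   # ['rate', 'of', 'change']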
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

@staticmethod
def tokenize(text):
    # Lowercase, tokenize, and drop English stop words; building the set
    # once avoids re-reading the corpus list for every token.
    stop = set(stopwords.words('english'))
    return [word for word in word_tokenize(text.lower()) if word not in stop]
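word_tokenize needs NLTK's Punkt models, and the enclosing class isn't shown, so Preprocessor below is a hypothetical name. A minimal sketch:

import nltk

nltk.download('punkt')       # tokenizer models (newer NLTK may also ask for 'punkt_tab')
nltk.download('stopwords')

print(Preprocessor.tokenize("The quick brown fox jumps."))
# e.g. ['quick', 'brown', 'fox', 'jumps', '.']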
@staticmethod
def removeStopWords(words):
    # Load the stop word list (one word per line) from stopwords.dic
    with open("stopwords.dic", encoding="utf-8") as file:
        stopwords = {line.strip() for line in file if line.strip()}
    # Keep non-empty words that are not stop words
    return [word for word in words if word.strip() and word not in stopwords]
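A usage sketch that writes a throwaway dictionary file first; the enclosing class isn't shown, so TextCleaner is a hypothetical name:

with open("stopwords.dic", "w", encoding="utf-8") as f:
    f.write("the\nand\n")

print(TextCleaner.removeStopWords(["the", "cat", "and", "dog"]))
# ['cat', 'dog']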
from string import punctuation

def cleanTokens(self, removeStopwords=True):
    # Note: the original declared this as a @property, but a property getter
    # only receives self, so removeStopwords could never be passed; it is a
    # regular method here.
    clean = [token for token in self.tokens if token not in punctuation]
    clean = [token.lower() for token in clean]
    clean = [token for token in clean if token.isalpha()]
    if removeStopwords:
        clean = self.removeStopwords(clean)
    return clean
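For reference, why the @property form can't work:

class Example:
    @property
    def broken(self, flag=True):   # flag is unreachable through attribute access
        return flag

print(Example().broken)       # True: the default, with no way to override it
# Example().broken(False)    # TypeError: 'bool' object is not callable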
import pyarabic.araby as araby
from cltk.tokenize.word import WordTokenizer

# ARABIC_STOPS: a collection of Arabic stop words defined elsewhere
# (e.g. CLTK's Arabic stops list); assumed here.

def stopwords_filter(string):
    text = string
    # strip tashkeel because the stop words list contains voweled words
    text = araby.strip_tashkeel(text)
    word_tokenizer = WordTokenizer("arabic")
    tokens = word_tokenizer.tokenize(text)

    # filter stop words
    no_stops = [w for w in tokens if w not in ARABIC_STOPS]

    return no_stops
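A usage sketch with a stand-in stop word set; stripping the tashkeel first is what lets the voweled 'إِلَى' match the bare 'إلى':

ARABIC_STOPS = {"إلى", "من"}   # stand-in stop word set
print(stopwords_filter("ذَهَبَ الوَلَدُ إِلَى المَدْرَسَةِ"))
# e.g. ['ذهب', 'الولد', 'المدرسة']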
def _load_stopwords():
    # Read one stop word per line, trimming newlines and surrounding spaces
    with open('./files/stopword.txt', 'r', encoding='utf-8') as fr:
        lines = [line.strip('\r\n ') for line in fr]
    return lines
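The loader expects one word per line; a quick self-contained sketch that writes a throwaway file at the assumed path first:

import os

os.makedirs('./files', exist_ok=True)
with open('./files/stopword.txt', 'w', encoding='utf-8') as fw:
    fw.write('the\na\nan\n')

print(_load_stopwords())   # ['the', 'a', 'an']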