Every line of the 'nltk word_tokenize' code snippets below is scanned for vulnerabilities by our powerful machine learning engine, which combs millions of open source libraries to help ensure your Python code is secure.
def tokenize(text):
    """
    Split a text into tokens (words, morphemes we can separate such as
    "n't", and punctuation).
    """
    return list(_tokenize_gen(text))
@requires_nltk_corpus
def sent_tokenize(self, text, **kwargs):
    """NLTK's sentence tokenizer (currently PunktSentenceTokenizer).

    Uses an unsupervised algorithm to build a model for abbreviation
    words, collocations, and words that start sentences, then uses
    that to find sentence boundaries.
    """
    sentences = self.sent_tok.tokenize(
        text,
        realign_boundaries=kwargs.get("realign_boundaries", True),
    )
    return sentences
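For context, the same Punkt model can be used outside a class through NLTK's module-level sent_tokenize helper. A minimal sketch, assuming the 'punkt' model (or 'punkt_tab' on newer NLTK releases) is available locally:

import nltk
nltk.download("punkt", quiet=True)  # fetch the Punkt model if it is missing

from nltk.tokenize import sent_tokenize

text = "Dr. Smith went to Washington. He arrived at 3 p.m. and left at 5."
print(sent_tokenize(text))
# Typically: ['Dr. Smith went to Washington.', 'He arrived at 3 p.m. and left at 5.']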
def word_tokenize(text):
    """Takes a string and returns a list of strings. Intended use: the
    input string is English text and the output consists of the
    lower-case words in this text with numbers and punctuation, except
    for hyphens, removed.

    The core work is done by NLTK's Treebank Word Tokenizer.

    :param text: Text to be tokenized.
    :type text: string

    :returns: tokens : list of strings
    """
    global word_tokenizer
    if word_tokenizer is None:
        import nltk
        word_tokenizer = nltk.TreebankWordTokenizer()

    # rehyph and process_word are helpers defined elsewhere in this module.
    text = rehyph(text)
    text = process_word(text)
    if isinstance(text, str):
        text = text.replace('\x00', '')
    text = text.lower()
    tokens = word_tokenizer.tokenize(text)

    return tokens
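As a quick illustration of what the Treebank tokenizer does on its own, before the lower-casing and number/punctuation stripping above, here is a minimal, self-contained sketch (no corpus download needed):

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize("They'll save and invest more."))
# Typically: ['They', "'ll", 'save', 'and', 'invest', 'more', '.']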
def tokenize(self, document):
    # Break the document into sentences
    for sent in sent_tokenize(document):
        # Break the sentence into part of speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token

            # If stopword, ignore token and continue
            if token in self.stopwords:
                continue

            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue

            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
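The lemmatize helper called above is not shown in this snippet. A plausible sketch (an assumption, not the original code) maps the first letter of the Penn Treebank tag to a WordNet POS and uses NLTK's WordNetLemmatizer; it requires the 'wordnet' corpus to be downloaded:

from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()

def lemmatize(token, tag):
    # Map a Penn Treebank tag prefix to a WordNet POS, defaulting to noun.
    pos = {"N": wn.NOUN, "V": wn.VERB, "R": wn.ADV, "J": wn.ADJ}.get(tag[0], wn.NOUN)
    return _lemmatizer.lemmatize(token, pos)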
@staticmethod
def tokenize(text):
    return [
        word for word in word_tokenize(text.lower())
        if word not in stopwords.words('english')
    ]
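One caveat with this pattern: the stopwords.words('english') call is evaluated once per token inside the comprehension, and membership is tested against a list. A small usage sketch that hoists the lookup into a set (assuming the 'stopwords' corpus is downloaded):

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

STOPWORDS = set(stopwords.words("english"))

def tokenize(text):
    return [word for word in word_tokenize(text.lower()) if word not in STOPWORDS]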
def nltk_tokenize(self, text, building=False):
    """
    Tokenize using NLTK PunktTokenizer.

    Uses nltk-trained PunktTokenizer for sentence tokenization and Treebank Word
    Tokenizer for tokenizing words within sentences.
    """
    return (
        token
        for sent in self.sent_tok.tokenize(text)
        for token in self.word_tok.tokenize(sent)
    )
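The sent_tok and word_tok attributes used in this generator are constructed elsewhere. A hedged sketch of how such a pair is commonly set up (a loaded Punkt English model plus a Treebank word tokenizer, assuming the 'punkt' model has been downloaded):

import nltk
from nltk.tokenize import TreebankWordTokenizer

sent_tok = nltk.data.load("tokenizers/punkt/english.pickle")
word_tok = TreebankWordTokenizer()

tokens = [
    token
    for sent in sent_tok.tokenize("First sentence. Second one.")
    for token in word_tok.tokenize(sent)
]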
def tokenize(self, text, unk=False):
    split_tokens = []
    # Run the basic tokenizer first, then split each resulting token
    # into WordPiece sub-tokens.
    for token in self.basic_tokenizer.tokenize(text):
        for sub_token in self.wordpiece_tokenizer.tokenize(token, unk=unk):
            split_tokens.append(sub_token)

    return split_tokens
def tokenize(text):
    import re
    import string
    tokens = word_tokenize(text)
    # Drop punctuation, tokens starting with a digit or '.', and tokens of length <= 2.
    output = [i for i in tokens
              if i not in string.punctuation
              and not re.match(r"^[0-9.].*$", i)
              and len(i) > 2]
    output = stem_tokens(output, stemmer)
    return output
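The stem_tokens helper and stemmer object referenced above are defined elsewhere in that project. A minimal stand-in, offered as an assumption for illustration, using NLTK's PorterStemmer:

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    return [stemmer.stem(token) for token in tokens]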
def word_tokenize(tokens):
    # Map Treebank-style quote tokens (`` and '') back to plain double quotes.
    return [token.replace("''", '"').replace("``", '"')
            for token in nltk.word_tokenize(tokens)]
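For reference, NLTK's default word_tokenize emits Treebank-style quote tokens (`` for opening and '' for closing double quotes), which is why this wrapper maps them back to plain quotation marks. A small usage sketch:

import nltk

print(nltk.word_tokenize('He said "hello" to me'))
# Typically: ['He', 'said', '``', 'hello', "''", 'to', 'me']
print(word_tokenize('He said "hello" to me'))
# With the wrapper above: ['He', 'said', '"', 'hello', '"', 'to', 'me']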
def word_tokenize(text):
    return words_pattern.findall(text)
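The words_pattern regex is defined outside this snippet. A hypothetical stand-in that would make the function runnable (purely illustrative, not the original definition):

import re

# Hypothetical: match runs of word characters, optionally with an internal apostrophe.
words_pattern = re.compile(r"\w+(?:'\w+)?")

print(word_tokenize("It's a regex-based tokenizer."))
# ["It's", 'a', 'regex', 'based', 'tokenizer']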