10 examples of 'nltk word_tokenize' in Python

Every line of these 'nltk word_tokenize' code snippets is scanned for vulnerabilities by our machine learning engine, which combs millions of open source libraries to help ensure your Python code is secure.

def tokenize(text):
    """
    Split a text into tokens (words, morphemes we can separate such as
    "n't", and punctuation).
    """
    return list(_tokenize_gen(text))
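
For comparison, NLTK's built-in word_tokenize already produces this kind of split, turning contractions such as "n't" into separate tokens. A minimal standalone example, assuming the Punkt resource has been downloaded:

import nltk
# nltk.download('punkt')  # one-time download (newer NLTK releases use 'punkt_tab')

print(nltk.word_tokenize("I can't believe it works!"))
# ['I', 'ca', "n't", 'believe', 'it', 'works', '!']
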
@requires_nltk_corpus
def sent_tokenize(self, text, **kwargs):
    """NLTK's sentence tokenizer (currently PunktSentenceTokenizer).

    Uses an unsupervised algorithm to build a model for abbreviation
    words, collocations, and words that start sentences, then uses
    that to find sentence boundaries.
    """
    sentences = self.sent_tok.tokenize(
        text,
        realign_boundaries=kwargs.get("realign_boundaries", True))
    return sentences
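
A minimal sketch of using the pre-trained Punkt model on its own, here through nltk.sent_tokenize, which wraps a PunktSentenceTokenizer trained on English text:

import nltk
# nltk.download('punkt')  # one-time download of the pre-trained Punkt model

text = "Mr. Brown arrived late. He blamed the traffic."
print(nltk.sent_tokenize(text))
# ['Mr. Brown arrived late.', 'He blamed the traffic.']
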
def word_tokenize(text):
    """Takes a string and returns a list of strings. Intended use: the
    input string is English text and the output consists of the
    lower-case words in this text with numbers and punctuation, except
    for hyphens, removed.

    The core work is done by NLTK's Treebank Word Tokenizer.

    :param text: Text to be tokenized.
    :type text: string

    :returns: tokens : list of strings
    """
    global word_tokenizer
    if word_tokenizer is None:
        import nltk
        word_tokenizer = nltk.TreebankWordTokenizer()

    # rehyph and process_word are module-level helpers that normalize
    # hyphenation and strip numbers/punctuation before tokenizing.
    text = rehyph(text)
    text = process_word(text)
    text = text.replace('\x00', '')
    text = text.lower()
    tokens = word_tokenizer.tokenize(text)

    return tokens
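
rehyph and process_word are helpers defined elsewhere in that module. A self-contained sketch of the same idea (lower-case Treebank tokens with digits and non-hyphen punctuation stripped); the helper logic here is an approximation, not the original code:

import re
import string

from nltk.tokenize import TreebankWordTokenizer

_tokenizer = TreebankWordTokenizer()
_strip_chars = ''.join(c for c in string.punctuation if c != '-')

def simple_word_tokenize(text):
    # Approximate stand-in for the rehyph/process_word pipeline above.
    text = text.replace('\x00', '').lower()
    cleaned = []
    for tok in _tokenizer.tokenize(text):
        tok = re.sub(r'\d', '', tok).strip(_strip_chars)  # drop digits and edge punctuation
        if tok:
            cleaned.append(tok)
    return cleaned

print(simple_word_tokenize("Well-known models cost $3,000!"))
# ['well-known', 'models', 'cost']
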
def tokenize(self, document):
    # Break the document into sentences
    for sent in sent_tokenize(document):
        # Break the sentence into part of speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token

            # If stopword, ignore token and continue
            if token in self.stopwords:
                continue

            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue

            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
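
The lemmatize method used above is not part of the snippet. A common implementation maps the Penn Treebank tag produced by pos_tag to a WordNet POS before calling WordNetLemmatizer; a sketch under that assumption:

from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# nltk.download('wordnet')  # one-time download

_lemmatizer = WordNetLemmatizer()

def lemmatize(token, tag):
    # Map the first letter of the Penn Treebank tag to a WordNet POS,
    # defaulting to noun for unknown tags.
    wn_pos = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}.get(tag[0], wn.NOUN)
    return _lemmatizer.lemmatize(token, wn_pos)

print(lemmatize('running', 'VBG'))  # 'run'
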
@staticmethod
def tokenize(text):
    return [word for word in word_tokenize(text.lower())
            if word not in stopwords.words('english')]
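
One note on this snippet: stopwords.words('english') is re-read for every token. Caching it as a set once makes the membership check O(1) per token; a small variation with the required imports spelled out:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# nltk.download('stopwords'); nltk.download('punkt')  # one-time downloads
_stopwords = set(stopwords.words('english'))

def tokenize(text):
    return [word for word in word_tokenize(text.lower()) if word not in _stopwords]
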
def nltk_tokenize(self, text, building=False):
    """
    Tokenize using NLTK PunktTokenizer.

    Uses nltk-trained PunktTokenizer for sentence tokenization and Treebank Word
    Tokenizer for tokenizing words within sentences.
    """
    return (
        token
        for sent in self.sent_tok.tokenize(text)
        for token in self.word_tok.tokenize(sent)
    )
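
self.sent_tok and self.word_tok are initialized elsewhere in that class. One way to build equivalent tokenizers is sketched below; the attribute names are taken from the snippet, and note that recent NLTK releases ship the Punkt model as 'punkt_tab' rather than the pickled form loaded here:

import nltk
from nltk.tokenize import TreebankWordTokenizer

# nltk.download('punkt')  # one-time download of the pre-trained Punkt model
sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')  # a PunktSentenceTokenizer
word_tok = TreebankWordTokenizer()

tokens = (
    token
    for sent in sent_tok.tokenize("First sentence. Second one.")
    for token in word_tok.tokenize(sent)
)
print(list(tokens))  # ['First', 'sentence', '.', 'Second', 'one', '.']
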
def tokenize(self, text, unk=False):
    # BERT-style subword tokenization (not NLTK): first split the text into
    # basic tokens, then break each token into WordPiece sub-tokens.
    split_tokens = []
    for token in self.basic_tokenizer.tokenize(text):
        for sub_token in self.wordpiece_tokenizer.tokenize(token, unk=unk):
            split_tokens.append(sub_token)

    return split_tokens

import re
import string

from nltk import word_tokenize

def tokenize(text):
    tokens = word_tokenize(text)
    # Keep tokens longer than two characters that are neither punctuation
    # nor start with a digit or a period, then stem them.
    output = [i for i in tokens
              if i not in string.punctuation
              and not re.match(r"^[0-9.].*$", i)
              and len(i) > 2]
    output = stem_tokens(output, stemmer)
    return output
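
stem_tokens and stemmer come from the surrounding module. A typical definition using NLTK's PorterStemmer (an assumption, since the originals are not shown):

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    # Reduce each token to its Porter stem.
    return [stemmer.stem(token) for token in tokens]

print(stem_tokens(['running', 'flies', 'easily'], stemmer))
# ['run', 'fli', 'easili']
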
import nltk

def word_tokenize(tokens):
    # nltk.word_tokenize emits Treebank-style `` and '' for double quotes;
    # convert them back to plain double quotes.
    return [token.replace("''", '"').replace("``", '"')
            for token in nltk.word_tokenize(tokens)]
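
The replacements above exist because the Treebank conventions used by nltk.word_tokenize turn double quotes into `` and '' pairs; a quick demonstration:

import nltk
# nltk.download('punkt')  # one-time download

raw = nltk.word_tokenize('He said "hello" twice.')
print(raw)   # ['He', 'said', '``', 'hello', "''", 'twice', '.']
print([t.replace("''", '"').replace("``", '"') for t in raw])
# ['He', 'said', '"', 'hello', '"', 'twice', '.']
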
def word_tokenize(text):
    # Pure-regex tokenization: words_pattern is a compiled module-level
    # regex (not shown in the snippet; see the sketch below).
    return words_pattern.findall(text)
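
words_pattern is not shown in the snippet; a plausible, purely hypothetical definition that makes the function runnable:

import re

# Hypothetical stand-in for the missing module-level pattern:
# runs of word characters, optionally with an internal apostrophe.
words_pattern = re.compile(r"[A-Za-z0-9_]+(?:'[A-Za-z]+)?")

print(words_pattern.findall("Regex tokenizers don't need NLTK models."))
# ['Regex', 'tokenizers', "don't", 'need', 'NLTK', 'models']
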
