10 examples of 'split sentence into words python' in Python

Every line of the 'split sentence into words python' code snippets below is scanned for vulnerabilities by our machine learning engine, which combs millions of open source libraries to help keep your Python code secure.

All examples are scanned by Snyk Code

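Before the library-specific snippets below, the two most common baselines for splitting a sentence into words are str.split() for whitespace splitting and a regular expression when punctuation should be stripped. A minimal sketch (the pattern here is illustrative, not taken from any snippet on this page):

import re

sentence = "The quick brown fox, obviously, jumps over the lazy dog."

print(sentence.split())                        # whitespace split, punctuation stays attached
# ['The', 'quick', 'brown', 'fox,', 'obviously,', 'jumps', 'over', 'the', 'lazy', 'dog.']

print(re.findall(r"[A-Za-z0-9']+", sentence))  # keep only word-like characters
# ['The', 'quick', 'brown', 'fox', 'obviously', 'jumps', 'over', 'the', 'lazy', 'dog']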
def _sent_tokenize(paragraph, lang_code):
    """Tokenize paragraph into sentences using a simple regex rule."""
    if lang_code in ['zh', 'ja']:
        index = 0
        sentences = list()
        for match in _SENTENCE_BORDER_REGEX_ZH.finditer(paragraph):
            sentences.append(paragraph[index: match.end(0)])
            index = match.end(0)

        if index < len(paragraph):
            sentences.append(paragraph[index:])
        return sentences
    else:
        return _SENTENCE_BORDER_REGEX.split(paragraph)
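_sent_tokenize relies on two compiled patterns, _SENTENCE_BORDER_REGEX and _SENTENCE_BORDER_REGEX_ZH, defined elsewhere in its source module. A minimal sketch of how it could be exercised, with illustrative (assumed, not original) pattern definitions:

import re

# Assumed, illustrative patterns; the original module defines its own.
_SENTENCE_BORDER_REGEX = re.compile(r"(?<=[.!?])\s+")   # split on whitespace after ., ! or ?
_SENTENCE_BORDER_REGEX_ZH = re.compile(r"[。！？]")       # match CJK full-width sentence enders

print(_sent_tokenize("First sentence. Second one!", "en"))
# ['First sentence.', 'Second one!']
print(_sent_tokenize("你好。今天天气不错！", "zh"))
# ['你好。', '今天天气不错！']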
def split_into_sentences(self, text):
    # prefixes, websites, alphabets, acronyms, starters and suffixes are
    # module-level regex strings; <prd> and <stop> are temporary placeholders
    # for non-terminal periods and real sentence boundaries respectively.
    text = " " + text + " "
    text = text.replace("\n", " ")
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences
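This method assumes module-level regular-expression strings (alphabets, prefixes, suffixes, starters, acronyms, websites) that are not part of the snippet. A sketch of plausible definitions, based on the widely circulated version of this splitter (treat the exact patterns as assumptions), together with a hypothetical call on an instance obj of the containing class:

import re

alphabets = "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

# obj.split_into_sentences("Hello world. It is a test! Is it?")
# -> ['Hello world.', 'It is a test!', 'Is it?']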
def _sentence_to_tokens(self, sentence):
    """Return the (ordered) list of tokens of the given sentence.

    :param sentence (str)
    :returns: list of str

    """
    # We are not using a vocabulary
    if self._wordslist is None:
        tokens = sentence.split()
    else:
        tokens = []
        # We need to check if each token is in the vocabulary
        for token in sentence.split():
            if self._wordslist.is_in(token):
                tokens.append(token)
            else:
                tokens.append(symbols.unk)

    if tokens[0] != self._ss:
        tokens.insert(0, self._ss)
    if tokens[-1] != self._es:
        tokens.append(self._es)
    return tokens
def split_words(data_str):
    """
    Takes a string, returns a list of pairs (word, 1),
    one for each word in the input, so
    [(w1, 1), (w2, 1), ..., (wn, 1)]
    """
    def _scan(str_data):
        pattern = re.compile(r'[\W_]+')
        return pattern.sub(' ', str_data).lower().split()

    def _remove_stop_words(word_list):
        with open('../stop_words.txt') as f:
            stop_words = f.read().split(',')
        stop_words.extend(list(string.ascii_lowercase))
        return [w for w in word_list if w not in stop_words]

    # The actual work of splitting the input into words
    result = []
    words = _remove_stop_words(_scan(data_str))
    for w in words:
        result.append((w, 1))
    return result
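A hypothetical call, assuming re and string are imported and ../stop_words.txt contains a comma-separated list such as the,a,of,in:

print(split_words("The quick brown fox jumps over the lazy dog"))
# [('quick', 1), ('brown', 1), ('fox', 1), ('jumps', 1), ('over', 1), ('lazy', 1), ('dog', 1)]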
def tokenize_sentence_split(text, nlp):
    tokenizer = nlp.tokenizer
    for line in text.split("\n"):
        tok_acc = []
        for tok in tokenizer(line):
            tok_acc.append(tok.text)
            if tok.text in SENT_ENDS:
                yield " ".join(tok_acc)
                tok_acc = []
        if tok_acc:
            yield " ".join(tok_acc)
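This generator expects a spaCy pipeline (for its tokenizer) and a SENT_ENDS collection defined elsewhere in the source module. A minimal sketch, assuming SENT_ENDS is simply terminal punctuation:

import spacy

SENT_ENDS = {".", "!", "?"}      # assumption: sentence-final tokens
nlp = spacy.blank("en")          # any pipeline with a tokenizer will do

for sent in tokenize_sentence_split("Hello world. How are you?", nlp):
    print(sent)
# Hello world .
# How are you ?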
@staticmethod
def __splits(word):
    return [(word[:i], word[i:])
            for i in range(len(word) + 1)]
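The method produces every possible (head, tail) split of a word, including the empty head and the empty tail. A standalone equivalent (outside its class, so without the name-mangled __splits):

def splits(word):
    return [(word[:i], word[i:]) for i in range(len(word) + 1)]

print(splits("cat"))
# [('', 'cat'), ('c', 'at'), ('ca', 't'), ('cat', '')]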
def tokenize(phrase, delimiter='_'):
    """ Tokenizes a phrase (converts those words to a unique token)
    """

    words = phrase.split(' ')
    res = []

    # remove the 's in text

    for w in words:
        w = w.split("'")[0]
        res.append(w)

    return delimiter.join(res)
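Example calls, showing that possessive endings are dropped and the words are re-joined with the delimiter:

print(tokenize("the dog's bone"))           # -> 'the_dog_bone'
print(tokenize("New York", delimiter='-'))  # -> 'New-York'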
def tokenise(self):
    words = self.pos_tagged.split()
    cat = None
    i = 1
    while i < len(words):
        if words[i] == '(GPE':
            i += 1
            cat = 'GPE'
        word, tag = self.parse_token(words[i])
        if word != '.':
            self.words.append(Word(word, tag, cat))
        i += 1

    w = self.words[0].text
    self.words[0].text = w[:1].lower() + w[1:]
def __call__(self, text):
    if not text.strip():
        return

    matches = self.re.finditer(text)
    previous = 0
    for match in matches:
        start = match.start()
        stop = match.end()
        delimiter = match.group(1)
        yield text[previous:start]
        left = text[max(0, start - self.window):start]
        right = text[stop:stop + self.window]
        yield SentSplit(left, delimiter, right)
        previous = stop
    yield text[previous:]
def tokenize(sent):
    return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]
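Because the delimiter group is captured by re.split, punctuation is kept as its own token (assuming re is imported):

print(tokenize("Hello, world!"))
# ['Hello', ',', 'world', '!']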
