Every line of 'split sentence into words python' code snippets is scanned for vulnerabilities by our powerful machine learning engine that combs millions of open source libraries, ensuring your Python code is secure.
def _sent_tokenize(paragraph, lang_code):
    """Tokenize paragraph into sentences using a simple regex rule."""
    if lang_code in ['zh', 'ja']:
        index = 0
        sentences = list()
        for match in _SENTENCE_BORDER_REGEX_ZH.finditer(paragraph):
            sentences.append(paragraph[index: match.end(0)])
            index = match.end(0)

        if index < len(paragraph):
            sentences.append(paragraph[index:])
        return sentences
    else:
        return _SENTENCE_BORDER_REGEX.split(paragraph)
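The snippet relies on two module-level compiled patterns that are not shown. A minimal sketch of how they might look and how the helper is called; the patterns below are assumptions, not the original project's definitions:

import re

# Assumed definitions -- the snippet does not show these module-level patterns.
_SENTENCE_BORDER_REGEX_ZH = re.compile(r'[。！？]+')   # CJK terminal punctuation
_SENTENCE_BORDER_REGEX = re.compile(r'(?<=[.!?])\s+')  # split after Western sentence enders

print(_sent_tokenize("你好。今天天气不错！", "zh"))
# ['你好。', '今天天气不错！']
print(_sent_tokenize("Hello there. How are you?", "en"))
# ['Hello there.', 'How are you?']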
def split_into_sentences(self, text):
    # "<prd>" marks periods that do not end a sentence (abbreviations, initials,
    # website TLDs); "<stop>" marks real sentence boundaries. Both placeholders
    # are resolved at the end, just before splitting.
    text = " " + text + " "
    text = text.replace("\n", " ")
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences
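The method also refers to several module-level regex fragments (alphabets, prefixes, suffixes, starters, acronyms, websites) that the snippet does not include. The definitions below are assumptions, matching the widely circulated version of this regex-based recipe:

import re

# Assumed module-level patterns (not shown in the snippet above).
alphabets = "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"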
def _sentence_to_tokens(self, sentence):
    """Return the (ordered) list of tokens of the given sentence.

    :param sentence: (str)
    :returns: list of str

    """
    # We are not using a vocabulary
    if self._wordslist is None:
        tokens = sentence.split()
    else:
        tokens = []
        # We need to check if each token is in the vocabulary
        for token in sentence.split():
            if self._wordslist.is_in(token):
                tokens.append(token)
            else:
                tokens.append(symbols.unk)

    if tokens[0] != self._ss:
        tokens.insert(0, self._ss)
    if tokens[-1] != self._es:
        tokens.append(self._es)
    return tokens
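The method leans on instance attributes and a symbols module that the snippet does not show: a vocabulary object exposing is_in(), an unknown-word token, and sentence start/end markers. A minimal stand-in, with all names assumed for illustration:

class _Vocab:
    """Minimal vocabulary stand-in exposing the is_in() lookup used above."""
    def __init__(self, words):
        self._words = set(words)

    def is_in(self, token):
        return token in self._words

# Assumed instance attributes consumed by _sentence_to_tokens:
#   self._wordslist     -> a _Vocab instance, or None to skip the vocabulary check
#   self._ss / self._es -> sentence start/end markers, e.g. "<s>" and "</s>"
#   symbols.unk         -> the unknown-word token, e.g. "<unk>"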
import re
import string

def split_words(data_str):
    """
    Takes a string, returns a list of pairs (word, 1),
    one for each word in the input, so
    [(w1, 1), (w2, 1), ..., (wn, 1)]
    """
    def _scan(str_data):
        # Replace runs of non-word characters with spaces, lowercase, then split.
        pattern = re.compile(r'[\W_]+')
        return pattern.sub(' ', str_data).lower().split()

    def _remove_stop_words(word_list):
        with open('../stop_words.txt') as f:
            stop_words = f.read().split(',')
        stop_words.extend(list(string.ascii_lowercase))
        return [w for w in word_list if w not in stop_words]

    # The actual work of splitting the input into words
    result = []
    words = _remove_stop_words(_scan(data_str))
    for w in words:
        result.append((w, 1))
    return result
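A usage example, assuming ../stop_words.txt exists and contains something like "a,the,of,and,on":

pairs = split_words("The cat sat on the mat, and the cat slept.")
print(pairs)
# [('cat', 1), ('sat', 1), ('mat', 1), ('cat', 1), ('slept', 1)]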
def tokenize_sentence_split(text, nlp):
    tokenizer = nlp.tokenizer
    for line in text.split("\n"):
        tok_acc = []
        for tok in tokenizer(line):
            tok_acc.append(tok.text)
            if tok.text in SENT_ENDS:
                yield " ".join(tok_acc)
                tok_acc = []
        if tok_acc:
            yield " ".join(tok_acc)
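This generator expects a spaCy-style nlp object whose tokenizer yields tokens with a .text attribute, plus a SENT_ENDS collection that is not shown; the definition below is an assumption. A usage sketch:

import spacy

# Assumed constant -- the snippet does not show how SENT_ENDS is defined.
SENT_ENDS = {".", "!", "?"}

nlp = spacy.blank("en")  # a blank pipeline is enough: only the tokenizer is used
text = "Hello world. How are you?\nSecond line without terminal punctuation"
for sent in tokenize_sentence_split(text, nlp):
    print(sent)
# Hello world .
# How are you ?
# Second line without terminal punctuation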
@staticmethod
def __splits(word):
    return [(word[:i], word[i:])
            for i in range(len(word) + 1)]
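This produces every way to cut a word into a prefix/suffix pair, a common building block in spelling-correction candidate generation. A standalone equivalent for illustration:

def splits(word):
    return [(word[:i], word[i:]) for i in range(len(word) + 1)]

print(splits("cat"))
# [('', 'cat'), ('c', 'at'), ('ca', 't'), ('cat', '')]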
def tokenize(phrase, delimiter='_'):
    """ Tokenizes a phrase (converts those words to a unique token)
    """

    words = phrase.split(' ')
    res = []

    # remove the 's in text
    for w in words:
        w = w.split("'")[0]
        res.append(w)

    return delimiter.join(res)
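For example, possessives are stripped and spaces become the delimiter:

print(tokenize("the cat's whiskers"))
# the_cat_whiskers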
def tokenise(self):
    words = self.pos_tagged.split()
    cat = None
    i = 1
    while i < len(words):
        if words[i] == '(GPE':
            i += 1
            cat = 'GPE'
        word, tag = self.parse_token(words[i])
        if word != '.':
            self.words.append(Word(word, tag, cat))
        i += 1

    w = self.words[0].text
    self.words[0].text = w[:1].lower() + w[1:]
def __call__(self, text):
    if not text.strip():
        return

    matches = self.re.finditer(text)
    previous = 0
    for match in matches:
        start = match.start()
        stop = match.end()
        delimiter = match.group(1)
        yield text[previous:start]
        left = text[max(0, start - self.window):start]
        right = text[stop:stop + self.window]
        yield SentSplit(left, delimiter, right)
        previous = stop
    yield text[previous:]
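The generator assumes an instance carrying a compiled pattern (self.re) whose first group captures the delimiter, a context window size (self.window), and a SentSplit container, none of which are shown. A hedged sketch that fills those in with assumed definitions and attaches the generator above:

import re
from collections import namedtuple

# Assumed pieces -- the surrounding class is not shown in the snippet.
SentSplit = namedtuple("SentSplit", ["left", "delimiter", "right"])

class Splitter:
    def __init__(self, window=5):
        self.window = window
        self.re = re.compile(r'([.!?])\s+')  # group(1) is the sentence delimiter

Splitter.__call__ = __call__  # attach the generator defined above

for chunk in Splitter()("One. Two? Three"):
    print(chunk)
# One
# SentSplit(left='One', delimiter='.', right='Two? ')
# Two
# SentSplit(left='. Two', delimiter='?', right='Three')
# Three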
import re

def tokenize(sent):
    return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]
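Because the delimiter pattern is wrapped in a capture group, re.split keeps the punctuation as separate tokens, and whitespace-only pieces are dropped by the strip filter. For example:

print(tokenize("Hello, world! It's 3 p.m."))
# ['Hello', ',', 'world', '!', 'It', "'", 's', '3', 'p', '.', 'm', '.']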