def word_tokenize(text):
    # `words_pattern` is a module-level compiled regex defined elsewhere in the source file.
    return words_pattern.findall(text)
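A minimal standalone sketch of the same idea, assuming `words_pattern` is a simple \w+ word regex (the pattern used in the original repository may differ):

import re

words_pattern = re.compile(r"\w+")  # assumed pattern, not necessarily the original's

def word_tokenize(text):
    return words_pattern.findall(text)

print(word_tokenize("Hello, world! It's 3:00 pm."))
# ['Hello', 'world', 'It', 's', '3', '00', 'pm']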
# Requires `from typing import List` at module level.
def tokenize(self, text: str) -> List[str]:
    # Character-level tokenization: every character becomes its own token.
    return list(text)
def _tokenize(self, text):
    # Abstract hook: subclasses must override this with a concrete tokenizer.
    raise NotImplementedError
def tokenize(st):
    # `tokenizer` is an external callable (e.g. a spaCy or NLTK tokenizer) defined elsewhere.
    return " ".join([str(x) for x in tokenizer(st)])
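For a quick standalone test, NLTK's word_tokenize can stand in for the undefined `tokenizer` (an assumption; the source repository may use something else, such as a spaCy tokenizer):

from nltk.tokenize import word_tokenize as tokenizer  # may require downloading the punkt models first

def tokenize(st):
    return " ".join([str(x) for x in tokenizer(st)])

print(tokenize("Don't panic."))  # "Do n't panic ."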
def tokenize(self, text, unk=False):
    # Two-stage BERT-style tokenization: split on whitespace/punctuation first,
    # then break each token into WordPiece sub-tokens.
    split_tokens = []
    for token in self.basic_tokenizer.tokenize(text):
        for sub_token in self.wordpiece_tokenizer.tokenize(token, unk=unk):
            split_tokens.append(sub_token)

    return split_tokens
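The subword step is typically a greedy longest-match-first split. A rough, self-contained sketch with a toy vocabulary (an illustration, not the repository's wordpiece_tokenizer):

def wordpiece(token, vocab, unk='[UNK]'):
    # Greedy longest-match-first subword split; non-initial pieces get a '##' prefix.
    pieces, start = [], 0
    while start < len(token):
        end, cur = len(token), None
        while start < end:
            piece = token[start:end] if start == 0 else '##' + token[start:end]
            if piece in vocab:
                cur = piece
                break
            end -= 1
        if cur is None:
            return [unk]  # no piece matched: the whole token is unknown
        pieces.append(cur)
        start = end
    return pieces

vocab = {'un', '##aff', '##able', 'play', '##ing'}
print(wordpiece('unaffable', vocab))  # ['un', '##aff', '##able']
print(wordpiece('playing', vocab))    # ['play', '##ing']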
def tokenize(self, line: str, lower: bool = False, is_json: bool = False, print_it: bool = False,
             remove_wordpiece_indicator=False, return_ids=False) -> Optional[str]:
    # Requires `import json` and `from typing import Optional` at module level.
    if not line.isspace():
        if is_json:
            # The text to tokenize is wrapped in a JSON object under the 'text' key.
            orig_json = json.loads(line)
            line = orig_json['text']
        else:
            orig_json = None
            line = line.strip()
        tokens = self._tokenize_line(line, lower, remove_wordpiece_indicator, return_ids)
        if orig_json:
            # Put the tokenized text back into the original JSON structure.
            orig_json['text'] = tokens
            tokens = json.dumps(orig_json)
        if print_it:
            print(tokens)
        return tokens
    else:
        return None
def _sent_tokenize(paragraph, lang_code):
    """Tokenize paragraph into sentences using a simple regex rule."""
    # `_SENTENCE_BORDER_REGEX` and `_SENTENCE_BORDER_REGEX_ZH` are module-level
    # compiled regexes defined elsewhere in the source file.
    if lang_code in ['zh', 'ja']:
        index = 0
        sentences = list()
        for match in _SENTENCE_BORDER_REGEX_ZH.finditer(paragraph):
            sentences.append(paragraph[index: match.end(0)])
            index = match.end(0)

        if index < len(paragraph):
            sentences.append(paragraph[index:])
        return sentences
    else:
        return _SENTENCE_BORDER_REGEX.split(paragraph)
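Combined with plausible definitions for the two regexes (assumptions; the patterns in the original repository may differ), the function above can be exercised like this:

import re

# Assumed sentence-border patterns, not the original ones.
_SENTENCE_BORDER_REGEX = re.compile(r"(?<=[.!?])\s+")   # split after ., ! or ? plus whitespace
_SENTENCE_BORDER_REGEX_ZH = re.compile(r"[。！？]+")      # CJK full-width sentence enders

print(_sent_tokenize("It rains. We stay in.", "en"))  # ['It rains.', 'We stay in.']
print(_sent_tokenize("你好。再见。", "zh"))               # ['你好。', '再见。']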
def tokenize(phrase, delimiter='_'):
    """Tokenizes a phrase (joins its words into a single unique token)."""

    words = phrase.split(' ')
    res = []

    # remove the 's in the text
    for w in words:
        w = w.split("'")[0]
        res.append(w)

    return delimiter.join(res)
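For example, with the default delimiter this collapses a multi-word phrase into one underscore-joined token and drops possessive endings:

print(tokenize("new york city"))   # 'new_york_city'
print(tokenize("the dog's bone"))  # 'the_dog_bone'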
def _tokenize(self, text):
    R = []
    for c in text:
        if c in self._token_dict:
            R.append(c)
        elif self._is_space(c):
            R.append('[unused1]')  # map whitespace to a reserved placeholder token
        else:
            R.append('[UNK]')      # out-of-vocabulary characters become unknown
    return R
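A standalone sketch of the same character-level mapping, with a toy vocabulary and a plain whitespace check standing in for the class attributes (assumptions, not the original implementation):

def char_tokenize(text, token_dict):
    out = []
    for c in text:
        if c in token_dict:
            out.append(c)
        elif c.isspace():
            out.append('[unused1]')
        else:
            out.append('[UNK]')
    return out

vocab = set('abcdefghijklmnopqrstuvwxyz')
print(char_tokenize('a b?', vocab))  # ['a', '[unused1]', 'b', '[UNK]']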
def test_colon_time(self):
    """Test the word tokenizer on a colon between digits in a time."""
    self.assertEqual(
        ['He', 'arrived', 'at', '3:00', 'pm', '.'],
        self.t.tokenize('He arrived at 3:00 pm.')
    )