Every line of these 'train_test_split sklearn' code snippets is scanned for vulnerabilities by our machine learning engine, which combs millions of open source libraries to help keep your Python code secure.
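Most of the snippets below either wrap or replace scikit-learn's own train_test_split. For reference, here is a minimal sketch of the standard call; the X and y arrays are toy placeholders:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)  # toy feature matrix
y = np.arange(10)                 # toy labels

# Hold out 25% of the rows for testing; fix random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)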
from sklearn.model_selection import train_test_split

def train_test_split_result(clf, X, y):
    print("This is a random, percentage-based split result ...")
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # sklearn.model_selection.train_test_split is the modern equivalent.
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    clf = clf.fit(X_train, y_train)
    report_result(clf, X_test, y_test, y_train)  # project helper defined elsewhere
import skorch

def my_train_split(ds, y):
    # `corpus` is defined elsewhere in the source project.
    return ds, skorch.dataset.Dataset(corpus.valid[:200], y=None)
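For context, a function with this signature is meant to be plugged into a skorch net's train_split argument; a minimal, hypothetical sketch (MyModule stands in for a torch.nn.Module, and X, y for your data):

import skorch

net = skorch.NeuralNetClassifier(
    MyModule,                    # hypothetical torch.nn.Module
    train_split=my_train_split,  # skorch invokes this with the dataset and targets
)
net.fit(X, y)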
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as sk_train_test_split

def train_test_split(fileName, type=1):
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    if type == 1:
        df = pd.read_csv(fileName, sep='\t', names=header)
    else:
        df = pd.read_csv(fileName, sep='::', names=header, engine='python')
    n_users = df.user_id.unique().shape[0]
    users = df.user_id.max()
    n_items = df.item_id.unique().shape[0]
    items = df.item_id.max()

    print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))
    print('The biggest user ID = ' + str(users) + ' | The biggest movie ID = ' + str(items))

    # sklearn.cross_validation is gone; the alias avoids shadowing this
    # function's own name.
    train_data, test_data = sk_train_test_split(df, test_size=0.1)
    train_data = pd.DataFrame(train_data)
    test_data = pd.DataFrame(test_data)

    # Create two user-item matrices, one for training and another for testing.
    # They are sized by the largest IDs, so gaps in the ID range stay zero.
    train_data_matrix = np.zeros((users, items))
    for line in train_data.itertuples():
        train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

    test_data_matrix = np.zeros((users, items))
    for line in test_data.itertuples():
        test_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    return train_data_matrix, test_data_matrix
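The two file layouts above match the MovieLens ratings files (tab-separated for the 100K set, '::'-separated for the 1M set); a hypothetical call, assuming the usual ml-100k directory layout:

# Hypothetical path; MovieLens 100K ships ratings as a tab-separated 'u.data'.
train_m, test_m = train_test_split('ml-100k/u.data', type=1)
print(train_m.shape, test_m.shape)  # each matrix is (max_user_id, max_item_id)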
import numpy as np

def test_split(self):
    """
    Apply split to the sample described in the docstring of
    prepare_time_inhomogeneous_cv_object, with n_splits = 4 and
    n_test_splits = 2. The folds are [0 : 6], [6 : 11], [11 : 16],
    [16 : 21]. We use an embargo of zero. Inspection shows that the
    train-test pairs should respectively be
    [...]
    3. Train: folds 1 and 4, samples [0, 1, 2, 3, 4, 16, 17, 18, 19, 20].
       Test: folds 2 and 3, samples [6, 7, 8, 9, 10, 11, 12, 13, 14, 15].
       Sample 5 is purged from the train set.
    4. Train: folds 2 and 3, samples [7, 8, 9, 10, 11, 12, 13, 14, 15].
       Test: folds 1 and 4, samples [0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20].
       Sample 6 is embargoed.
    [...]
    """
    cv = CombPurgedKFoldCV(n_splits=4, n_test_splits=2)
    prepare_time_inhomogeneous_cv_object(cv)
    count = 0
    for train_set, test_set in cv.split(cv.X, pred_times=cv.pred_times,
                                        eval_times=cv.eval_times):
        count += 1
        if count == 3:
            result_train = np.array([0, 1, 2, 3, 4, 16, 17, 18, 19, 20])
            result_test = np.array([6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
            self.assertTrue(np.array_equal(result_train, train_set))
            self.assertTrue(np.array_equal(result_test, test_set))
        if count == 4:
            result_train = np.array([7, 8, 9, 10, 11, 12, 13, 14, 15])
            result_test = np.array([0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20])
            self.assertTrue(np.array_equal(result_train, train_set))
            self.assertTrue(np.array_equal(result_test, test_set))
from math import floor

def train_test_split(X, y, train_percentage=0.8):
    '''
    Very simple split into train and test data. Works for any input
    shape without dependencies, but is a bit restricted: no shuffling,
    no stratification.
    '''
    # Use the parameter here; the original hard-coded 0.80 and ignored it.
    cut_idx = int(floor(X.shape[0] * train_percentage))
    X_train, X_test = X[:cut_idx], X[cut_idx:]
    y_train, y_test = y[:cut_idx], y[cut_idx:]
    print("Number of train samples", X_train.shape[0])
    print("Number of test samples", X_test.shape[0])

    return (X_train, y_train), (X_test, y_test)
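A quick sanity check of the fixed function with toy numpy arrays (all names here are placeholders):

import numpy as np

X = np.arange(100).reshape(50, 2)
y = np.arange(50)
(X_train, y_train), (X_test, y_test) = train_test_split(X, y, train_percentage=0.9)
# Prints "Number of train samples 45" and "Number of test samples 5".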
def train_test(self, train_path, test_path=None):
    # load train and (maybe) test data
    metadata = MetaData(label_column=self.label_column,
                        train_path=train_path,
                        test_path=test_path)
    self.num_classes = metadata.k_classes
    self.num_features = metadata.d_features

    # if necessary, cast judgment metric into its binary/multiclass equivalent
    if self.num_classes == 2:
        if self.judgment_metric in [Metrics.F1_MICRO, Metrics.F1_MACRO]:
            self.judgment_metric = Metrics.F1
        elif self.judgment_metric in [Metrics.ROC_AUC_MICRO,
                                      Metrics.ROC_AUC_MACRO]:
            self.judgment_metric = Metrics.ROC_AUC
    else:
        if self.judgment_metric == Metrics.F1:
            self.judgment_metric = Metrics.F1_MACRO
        elif self.judgment_metric == Metrics.ROC_AUC:
            self.judgment_metric = Metrics.ROC_AUC_MACRO

    # load training data
    train_data = self.load_data(train_path)

    # if necessary, generate permanent train/test split
    if test_path is not None:
        test_data = self.load_data(test_path)
    else:
        train_data, test_data = train_test_split(train_data,
                                                 test_size=self.testing_ratio,
                                                 random_state=self.random_state)

    # extract feature matrix and labels from raw data
    self.encoder = DataEncoder(label_column=self.label_column)
    X_train, y_train = self.encoder.fit_transform(train_data)
    X_test, y_test = self.encoder.transform(test_data)

    # create and cross-validate pipeline
    self.make_pipeline()
    cv_scores = self.cross_validate(X_train, y_train)

    # train and test the final model
    self.pipeline.fit(X_train, y_train)
    test_scores = self.test_final_model(X_test, y_test)
    return {'cv': cv_scores, 'test': test_scores}
import os
from glob import glob
from shutil import copyfile, rmtree

import numpy as np

# COMBINE_FOLDERS and SELECT_FOLDERS are accepted but unused in this excerpt.
def train_valid_test_split(SOURCE_DATA_DIR, TARGET_DATA_DIR, train_size=0.8, valid_size=0.1,
                           COMBINE_FOLDERS=None, SELECT_FOLDERS=None):
    """
    Usage:
        SOURCE_DATA_DIR = "data/ClothingAttributeDataset/images/"
        TARGET_DATA_DIR = "data/"

        train_valid_test_split(SOURCE_DATA_DIR, TARGET_DATA_DIR)
    """
    if COMBINE_FOLDERS is None:
        COMBINE_FOLDERS = dict()
    # Recreate empty train/test/valid folders under the target directory.
    for folder_name in ["train", "test", "valid"]:
        rmtree(os.path.join(TARGET_DATA_DIR, folder_name), ignore_errors=True)
        os.makedirs(os.path.join(TARGET_DATA_DIR, folder_name))

    # Shuffle the images, then split by position: the first train_size
    # fraction goes to train, the next valid_size fraction to valid, the rest to test.
    filenames = np.random.permutation(glob(os.path.join(SOURCE_DATA_DIR, "*.jpg")))

    train_idx = int(len(filenames) * train_size)
    test_idx = int(len(filenames) * (train_size + valid_size))
    for idx, filename in enumerate(filenames):
        target_name = os.path.basename(filename)
        if idx < train_idx:
            target_filepath = os.path.join(TARGET_DATA_DIR, "train", target_name)
        elif idx < test_idx:
            target_filepath = os.path.join(TARGET_DATA_DIR, "valid", target_name)
        else:
            target_filepath = os.path.join(TARGET_DATA_DIR, "test", target_name)
        copyfile(filenames[idx], target_filepath)
from sklearn.model_selection import ShuffleSplit

@staticmethod
def _get_split(X, y):
    # Single random shuffle split; the original used the removed
    # cross_validation.ShuffleSplit(n, n_iter=1) API.
    splitter = ShuffleSplit(n_splits=1)
    train, validate = next(splitter.split(X))
    X_train, X_validate, y_train, y_validate = X[train], X[validate], y[train], y[validate]
    return X_train, X_validate, y_train, y_validate
import numpy as np

def split_train_evaluate(self, X, Y, train_percent, seed=0):
    state = np.random.get_state()
    np.random.seed(seed)  # `seed` was previously accepted but never used
    training_size = int(train_percent * len(X))
    shuffle_indices = np.random.permutation(np.arange(len(X)))
    X_train = [X[shuffle_indices[i]] for i in range(training_size)]
    Y_train = [Y[shuffle_indices[i]] for i in range(training_size)]
    X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))]
    Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))]

    self.train(X_train, Y_train, Y)  # the full label set Y is passed through, as in the original
    np.random.set_state(state)  # restore the global RNG state
    return self.evaluate(X_test, Y_test)
from pyspark.sql import functions as f
from pyspark.sql.types import BooleanType

def _train_val_split(df, validation):
    train_df = df
    val_df = None
    validation_ratio = 0.0

    if isinstance(validation, float) and validation > 0:
        # A float is treated as the fraction of rows to hold out for validation.
        train_df, val_df = train_df.randomSplit([1.0 - validation, validation])
        validation_ratio = validation
    elif isinstance(validation, str):
        # A string names a boolean (or 0/1) column that flags validation rows.
        dtype = [field.dataType for field in df.schema.fields if field.name == validation][0]
        bool_dtype = isinstance(dtype, BooleanType)
        val_df = train_df.filter(
            f.col(validation) if bool_dtype else f.col(validation) > 0).drop(validation)
        train_df = train_df.filter(
            ~f.col(validation) if bool_dtype else f.col(validation) == 0).drop(validation)

        # Approximate ratio of validation data to training data for proportionate scale
        # of partitions
        timeout_ms = 1000
        confidence = 0.90
        train_rows = train_df.rdd.countApprox(timeout=timeout_ms, confidence=confidence)
        val_rows = val_df.rdd.countApprox(timeout=timeout_ms, confidence=confidence)
        validation_ratio = val_rows / (val_rows + train_rows)
    elif validation:
        raise ValueError('Unrecognized validation type: {}'.format(type(validation)))

    return train_df, val_df, validation_ratio
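Hypothetical calls covering both modes (the DataFrame df and the 'is_val' column are assumptions):

# Hold out 10% of rows at random:
train_df, val_df, ratio = _train_val_split(df, validation=0.1)

# Or mark validation rows yourself in a boolean column named 'is_val':
train_df, val_df, ratio = _train_val_split(df, validation='is_val')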