Every line of the 'train test split sklearn' code snippets below is scanned for vulnerabilities by our machine learning engine, which combs millions of open source libraries to help keep your Python code secure.
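For reference, here is the canonical scikit-learn call that most of the snippets below adapt; a minimal sketch on synthetic data (the array shapes are illustrative only):

from sklearn.model_selection import train_test_split
import numpy as np

X = np.arange(20).reshape(10, 2)   # 10 samples, 2 features
y = np.arange(10) % 2              # binary labels

# Hold out 25% for testing; fix random_state for reproducibility and
# stratify on y to preserve the class balance in both halves.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)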
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20

def train_test_split_result(clf, X, y):
    # Random, percentage-based split (default 75/25 train/test).
    print("This is a random, percentage-based split result ...")
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    clf = clf.fit(X_train, y_train)
    report_result(clf, X_test, y_test, y_train)  # report_result is defined elsewhere in this project
import skorch

def my_train_split(ds, y):
    # Train on the full dataset; validate on a fixed slice of an external
    # corpus (`corpus` is defined in the enclosing scope).
    return ds, skorch.dataset.Dataset(corpus.valid[:200], y=None)
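This callable plugs into skorch's train_split hook, which older skorch versions invoke as train_split(dataset, y), matching the signature above. A hedged usage sketch, where MyModule stands in for a hypothetical torch.nn.Module:

from skorch import NeuralNetClassifier

net = NeuralNetClassifier(
    MyModule,                    # hypothetical torch.nn.Module
    train_split=my_train_split,  # validation data comes from my_train_split, not from ds
)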
import numpy as np
import pandas as pd
from sklearn import model_selection as cv  # replaces the removed sklearn.cross_validation

def train_test_split(fileName, type=1):
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    if type == 1:
        df = pd.read_csv(fileName, sep='\t', names=header)
    else:
        df = pd.read_csv(fileName, sep='::', names=header, engine='python')
    n_users = df.user_id.unique().shape[0]
    users = df.user_id.max()
    n_items = df.item_id.unique().shape[0]
    items = df.item_id.max()

    print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))
    print('The biggest ID of users = ' + str(users) + ' | The biggest ID of movies = ' + str(items))

    train_data, test_data = cv.train_test_split(df, test_size=0.1)
    train_data = pd.DataFrame(train_data)
    test_data = pd.DataFrame(test_data)
    # Create two user-item matrices, one for training and another for testing;
    # size them by the largest IDs so sparse ID ranges still fit.
    train_data_matrix = np.zeros((users, items))
    for line in train_data.itertuples():
        train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

    test_data_matrix = np.zeros((users, items))
    for line in test_data.itertuples():
        test_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    return train_data_matrix, test_data_matrix
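The header and separators match the MovieLens dumps; a hedged usage sketch, assuming the MovieLens 100k u.data (tab-separated) or 1M ratings.dat ('::'-separated) files are on disk:

train_matrix, test_matrix = train_test_split('ml-100k/u.data', type=1)      # tab-separated
train_matrix, test_matrix = train_test_split('ml-1m/ratings.dat', type=2)   # '::'-separated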
def test_split(self):
    """
    Apply split to the sample described in the docstring of
    prepare_time_inhomogeneous_cv_object, with n_splits = 4 and
    n_test_splits = 2. The folds are [0 : 6], [6 : 11], [11 : 16], [16 : 21].
    We use an embargo of zero.
    Inspection shows that the test-train pairs should respectively be
    [...]
    3. Train: folds 1 and 4, samples [0, 1, 2, 3, 4, 16, 17, 18, 19, 20].
       Test: folds 2 and 3, samples [6, 7, 8, 9, 10, 11, 12, 13, 14, 15].
       Sample 5 is purged from the train set.
    4. Train: folds 2 and 3, samples [7, 8, 9, 10, 11, 12, 13, 14, 15].
       Test: folds 1 and 4, samples [0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20].
       Sample 6 is embargoed.
    [...]
    """
    cv = CombPurgedKFoldCV(n_splits=4, n_test_splits=2)
    prepare_time_inhomogeneous_cv_object(cv)
    count = 0
    for train_set, test_set in cv.split(cv.X, pred_times=cv.pred_times, eval_times=cv.eval_times):
        count += 1
        if count == 3:
            result_train = np.array([0, 1, 2, 3, 4, 16, 17, 18, 19, 20])
            result_test = np.array([6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
            self.assertTrue(np.array_equal(result_train, train_set))
            self.assertTrue(np.array_equal(result_test, test_set))
        if count == 4:
            result_train = np.array([7, 8, 9, 10, 11, 12, 13, 14, 15])
            result_test = np.array([0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20])
            self.assertTrue(np.array_equal(result_train, train_set))
            self.assertTrue(np.array_equal(result_test, test_set))
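The rule being asserted, informally: a training sample is purged when the time span of its label overlaps a test fold, and embargoed when it follows a test fold too closely. A minimal sketch of the purging condition, assuming per-sample pred_times (when a prediction is made) and eval_times (when its label becomes known); this is an illustration, not the library's implementation:

import numpy as np

def purge(train_idx, test_start, test_end, pred_times, eval_times):
    # Keep a training sample only if its label is fully known before the
    # test window opens, or its prediction is made after the window closes.
    keep = (eval_times[train_idx] < test_start) | (pred_times[train_idx] > test_end)
    return train_idx[keep]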
from math import floor

def train_test_split(X, y, train_percentage=0.8):
    '''
    Very simple split into train and test data. Works for any input shape
    without dependencies, but is a bit restricted: no shuffling, no stratification.
    '''
    cut_idx = int(floor(X.shape[0] * train_percentage))  # was hard-coded to 0.80, ignoring the parameter
    X_train, X_test = X[:cut_idx], X[cut_idx:]
    y_train, y_test = y[:cut_idx], y[cut_idx:]
    print("Number of train samples", X_train.shape[0])
    print("Number of test samples", X_test.shape[0])

    return (X_train, y_train), (X_test, y_test)
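Because the split is purely sequential, shuffle the rows first when the ordering is not already random (for time series, keep the order as-is). A small usage sketch, assuming X and y are aligned NumPy arrays:

import numpy as np

perm = np.random.permutation(X.shape[0])   # one shared permutation keeps X and y aligned
(X_train, y_train), (X_test, y_test) = train_test_split(X[perm], y[perm], train_percentage=0.8)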
def train_test(self, train_path, test_path=None):
    # load train and (maybe) test data
    metadata = MetaData(label_column=self.label_column,
                        train_path=train_path,
                        test_path=test_path)
    self.num_classes = metadata.k_classes
    self.num_features = metadata.d_features

    # if necessary, cast judgment metric into its binary/multiary equivalent
    if self.num_classes == 2:
        if self.judgment_metric in [Metrics.F1_MICRO, Metrics.F1_MACRO]:
            self.judgment_metric = Metrics.F1
        elif self.judgment_metric in [Metrics.ROC_AUC_MICRO,
                                      Metrics.ROC_AUC_MACRO]:
            self.judgment_metric = Metrics.ROC_AUC
    else:
        if self.judgment_metric == Metrics.F1:
            self.judgment_metric = Metrics.F1_MACRO
        elif self.judgment_metric == Metrics.ROC_AUC:
            self.judgment_metric = Metrics.ROC_AUC_MACRO

    # load training data
    train_data = self.load_data(train_path)

    # if necessary, generate permanent train/test split
    if test_path is not None:
        test_data = self.load_data(test_path)
    else:
        train_data, test_data = train_test_split(train_data,
                                                 test_size=self.testing_ratio,
                                                 random_state=self.random_state)

    # extract feature matrix and labels from raw data; the encoder is fit on
    # the training rows only and merely applied to the test rows, which
    # avoids leaking test-set statistics into preprocessing
    self.encoder = DataEncoder(label_column=self.label_column)
    X_train, y_train = self.encoder.fit_transform(train_data)
    X_test, y_test = self.encoder.transform(test_data)

    # create and cross-validate pipeline
    self.make_pipeline()
    cv_scores = self.cross_validate(X_train, y_train)

    # train and test the final model
    self.pipeline.fit(X_train, y_train)
    test_scores = self.test_final_model(X_test, y_test)
    return {'cv': cv_scores, 'test': test_scores}
import os
from glob import glob
from shutil import copyfile, rmtree

import numpy as np

def train_valid_test_split(SOURCE_DATA_DIR, TARGET_DATA_DIR, train_size=0.8, valid_size=0.1,
                           COMBINE_FOLDERS=None, SELECT_FOLDERS=None):
    """
    Usage:
    SOURCE_DATA_DIR = "data/ClothingAttributeDataset/images/"
    TARGET_DATA_DIR = "data/"

    train_valid_test_split(SOURCE_DATA_DIR, TARGET_DATA_DIR)
    """
    # COMBINE_FOLDERS and SELECT_FOLDERS are accepted for API compatibility
    # but unused in this snippet.
    if COMBINE_FOLDERS is None:
        COMBINE_FOLDERS = dict()
    for folder_name in ["train", "test", "valid"]:
        rmtree(os.path.join(TARGET_DATA_DIR, folder_name), ignore_errors=True)
        os.makedirs(os.path.join(TARGET_DATA_DIR, folder_name))

    # Shuffle the images, then split them train/valid/test by index position
    filenames = np.random.permutation(glob(os.path.join(SOURCE_DATA_DIR, "*.jpg")))

    train_idx = int(len(filenames) * train_size)
    test_idx = int(len(filenames) * (train_size + valid_size))
    for idx, filename in enumerate(filenames):
        target_name = os.path.basename(filename)  # portable, unlike filename.split("/")
        if idx < train_idx:
            target_filepath = os.path.join(TARGET_DATA_DIR, "train", target_name)
        elif idx < test_idx:
            target_filepath = os.path.join(TARGET_DATA_DIR, "valid", target_name)
        else:
            target_filepath = os.path.join(TARGET_DATA_DIR, "test", target_name)
        copyfile(filenames[idx], target_filepath)
from sklearn.model_selection import ShuffleSplit

@staticmethod
def _get_split(X, y):
    # One random shuffle-split (default 10% held out for validation).
    split = ShuffleSplit(n_splits=1)   # old API: ShuffleSplit(y.shape[0], n_iter=1)
    train, validate = next(split.split(X, y))
    X_train, X_validate, y_train, y_validate = X[train], X[validate], y[train], y[validate]
    return X_train, X_validate, y_train, y_validate
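The same single split can be written with the one-call API; ShuffleSplit's default test_size is 0.1, so an equivalent sketch (assuming X and y are NumPy arrays) is:

from sklearn.model_selection import train_test_split

# One shuffled 90/10 split, same as ShuffleSplit(n_splits=1) above
X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.1)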
def split_train_evaluate(self, X, Y, train_percent, seed=0):
    # Save the global RNG state so this shuffle does not perturb callers.
    state = np.random.get_state()
    np.random.seed(seed)   # `seed` was accepted but unused; use it for a reproducible shuffle
    training_size = int(train_percent * len(X))
    shuffle_indices = np.random.permutation(np.arange(len(X)))
    X_train = [X[shuffle_indices[i]] for i in range(training_size)]
    Y_train = [Y[shuffle_indices[i]] for i in range(training_size)]
    X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))]
    Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))]

    self.train(X_train, Y_train, Y)
    np.random.set_state(state)
    return self.evaluate(X_test, Y_test)
from pyspark.sql import functions as f
from pyspark.sql.types import BooleanType

def _train_val_split(df, validation):
    train_df = df
    val_df = None
    validation_ratio = 0.0

    if isinstance(validation, float) and validation > 0:
        train_df, val_df = train_df.randomSplit([1.0 - validation, validation])
        validation_ratio = validation
    elif isinstance(validation, str):
        # Interpret `validation` as a marker column: boolean, or numeric with
        # nonzero meaning "validation row". The column is dropped afterwards.
        dtype = [field.dataType for field in df.schema.fields if field.name == validation][0]
        bool_dtype = isinstance(dtype, BooleanType)
        val_df = train_df.filter(
            f.col(validation) if bool_dtype else f.col(validation) > 0).drop(validation)
        train_df = train_df.filter(
            ~f.col(validation) if bool_dtype else f.col(validation) == 0).drop(validation)

        # Approximate the fraction of rows that are validation data, for
        # proportionate scaling of partitions
        timeout_ms = 1000
        confidence = 0.90
        train_rows = train_df.rdd.countApprox(timeout=timeout_ms, confidence=confidence)
        val_rows = val_df.rdd.countApprox(timeout=timeout_ms, confidence=confidence)
        validation_ratio = val_rows / (val_rows + train_rows)
    elif validation:
        raise ValueError('Unrecognized validation type: {}'.format(type(validation)))

    return train_df, val_df, validation_ratio
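A hedged usage sketch, assuming an active SparkSession and a DataFrame df; validation is either a float fraction or the name of a marker column:

# Random 90/10 row split
train_df, val_df, ratio = _train_val_split(df, 0.1)

# Rows where the 'is_val' column is true (or nonzero) become the validation set
train_df, val_df, ratio = _train_val_split(df, 'is_val')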