10 examples of 'split data into train and test in python' in Python

Every line of these 'split data into train and test in python' code snippets is scanned for vulnerabilities by our machine learning engine, which combs millions of open source libraries to help keep your Python code secure.

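Several of the snippets below rely on scikit-learn's train_test_split or ShuffleSplit, while others implement the split by hand. As a baseline, here is a minimal sketch of the common scikit-learn pattern (the column names and sizes are illustrative):

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.DataFrame({"feature": range(100), "label": [0, 1] * 50})
X = df.drop("label", axis=1)
y = df["label"]

# Hold out 20% of the rows; fixing random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)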

def prepare_data():
    # Load the raw MATLAB file into a DataFrame; raw_filename, held_out_test_size,
    # random_state, train_filename and test_filename are defined elsewhere in the
    # source module, as are the scipy.io, pandas (pd) and
    # sklearn.model_selection.ShuffleSplit imports.
    data = io.loadmat(raw_filename)
    df = pd.DataFrame(dict(
        spectra=data['Int_ABQR'].tolist(),
        solute=data['Gamme_ABQR'].ravel(),
        vial=data['Vial_ABQR'].ravel(),
        concentration=data['Conc_ABQR'].ravel(),
        molecule=data['Molecule_ABQR'].ravel()))
    # Hold out a random test set and write both splits to CSV.
    skf = ShuffleSplit(n_splits=2, test_size=held_out_test_size,
                       random_state=random_state)
    train_is, test_is = list(skf.split(df))[0]
    df_train = df.iloc[train_is]
    df_test = df.iloc[test_is]
    df_train.to_csv(train_filename, index=False)
    df_test.to_csv(test_filename, index=False)

def split_data(data, percent_train=0.80):
    # Assign the first `percent_train` fraction of the rows to the training set
    # and the remainder to the test set, preserving the input order (no shuffling).
    num_rows = len(data)
    train_data, test_data = [], []
    for idx, row in enumerate(data):
        if idx < num_rows * percent_train:
            train_data.append(row)
        else:
            test_data.append(row)
    return train_data, test_data

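Because this function assigns rows in their original order, any sorting in the input carries straight into the split. A minimal sketch of a shuffled variant, assuming the rows are a plain Python sequence and a fixed seed is wanted for reproducibility (the function name is illustrative):

import random

def split_data_shuffled(data, percent_train=0.80, seed=0):
    # Shuffle a copy of the rows so the split does not depend on input order.
    rows = list(data)
    random.Random(seed).shuffle(rows)
    cutoff = int(len(rows) * percent_train)
    return rows[:cutoff], rows[cutoff:]
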
def my_train_split(ds, y):
    # skorch train_split hook: train on the full dataset and validate on the
    # first 200 entries of corpus.valid (defined elsewhere in the source module).
    return ds, skorch.dataset.Dataset(corpus.valid[:200], y=None)

def split_data(df):
    # Separate the features from the target column 'Y', then hold out 20% of the
    # rows as a test set; assumes sklearn.model_selection.train_test_split is imported.
    X = df.drop('Y', axis=1).values
    y = df['Y'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    data = {"train": {"X": X_train, "y": y_train},
            "test": {"X": X_test, "y": y_test}}
    return data

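A short usage sketch for the function above, assuming a pandas DataFrame with a 'Y' target column (the data here is illustrative):

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.DataFrame({"a": range(10), "b": range(10, 20), "Y": [0, 1] * 5})
splits = split_data(df)
print(splits["train"]["X"].shape, splits["test"]["X"].shape)  # (8, 2) (2, 2)
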
def train_valid_test_split(SOURCE_DATA_DIR, TARGET_DATA_DIR, train_size=0.8, valid_size=0.1,
                           COMBINE_FOLDERS=None, SELECT_FOLDERS=None):
    """
    Usage:
        SOURCE_DATA_DIR = "data/ClothingAttributeDataset/images/"
        TARGET_DATA_DIR = "data/"

        train_valid_test_split(SOURCE_DATA_DIR, TARGET_DATA_DIR)
    """
    # Relies on os, glob.glob, numpy as np, and shutil's rmtree/copyfile being
    # imported in the source module; COMBINE_FOLDERS and SELECT_FOLDERS are not
    # used in this excerpt.
    if COMBINE_FOLDERS is None:
        COMBINE_FOLDERS = dict()

    # Recreate empty train/test/valid folders under the target directory.
    for folder_name in ["train", "test", "valid"]:
        rmtree(os.path.join(TARGET_DATA_DIR, folder_name), ignore_errors=True)
        os.makedirs(os.path.join(TARGET_DATA_DIR, folder_name))

    # Shuffle the image files, then split them into train/valid/test by position.
    filenames = np.random.permutation(glob(os.path.join(SOURCE_DATA_DIR, "*.jpg")))

    train_idx = int(len(filenames) * train_size)
    test_idx = int(len(filenames) * (train_size + valid_size))
    for idx, filename in enumerate(filenames):
        target_name = filename.split("/")[-1]  # POSIX-style path split
        if idx < train_idx:
            target_filepath = os.path.join(TARGET_DATA_DIR, "train", target_name)
        elif idx < test_idx:
            target_filepath = os.path.join(TARGET_DATA_DIR, "valid", target_name)
        else:
            target_filepath = os.path.join(TARGET_DATA_DIR, "test", target_name)
        copyfile(filenames[idx], target_filepath)

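np.random.permutation above draws from NumPy's global random state, so repeated runs copy the files into different splits. If reproducible splits are wanted, one option is to seed the generator before calling the function; the directories below are the ones from the docstring:

import numpy as np

SOURCE_DATA_DIR = "data/ClothingAttributeDataset/images/"
TARGET_DATA_DIR = "data/"

np.random.seed(42)  # fix NumPy's global seed so the shuffle, and hence the split, is repeatable
train_valid_test_split(SOURCE_DATA_DIR, TARGET_DATA_DIR)
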
def split_data():
    # Randomly assign 90% of the raw_data part files to train_data and the rest
    # to test_data, caching the chosen indices in a text file so the split stays
    # the same across runs. Relies on os, numpy and shutil being imported.
    split_rate_ = 0.9
    dir_train_file_idx_ = 'aid_data/train_file_idx.txt'
    filelist_ = ['raw_data/part-%d' % x for x in range(len(os.listdir('raw_data')))]

    if not os.path.exists(dir_train_file_idx_):
        train_file_idx = list(
            numpy.random.choice(
                len(filelist_), int(len(filelist_) * split_rate_), False))
        with open(dir_train_file_idx_, 'w') as fout:
            fout.write(str(train_file_idx))
    else:
        with open(dir_train_file_idx_, 'r') as fin:
            # Note: eval() executes whatever the cached file contains.
            train_file_idx = eval(fin.read())

    for idx in range(len(filelist_)):
        if idx in train_file_idx:
            shutil.move(filelist_[idx], 'train_data')
        else:
            shutil.move(filelist_[idx], 'test_data')

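Reading the cached index list back with eval() executes whatever the file contains, which is a code-injection risk if that file can be modified. A minimal sketch of a safer variant that stores the indices as JSON (the helper name and path handling are illustrative):

import json
import os

import numpy

def load_or_create_train_idx(path, n_files, split_rate=0.9):
    # Reuse a previously sampled index list, or sample and persist a new one as JSON.
    if os.path.exists(path):
        with open(path) as fin:
            return json.load(fin)
    train_idx = numpy.random.choice(n_files, int(n_files * split_rate), False).tolist()
    with open(path, 'w') as fout:
        json.dump(train_idx, fout)
    return train_idx
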
def read_train_data():
    # Parse libsvm-style "index:value" features from standard input; the original
    # snippet targets Python 2 (raw_input/xrange), shown here with the Python 3
    # equivalents. `transformer` is defined elsewhere in the source module
    # (a feature transformer such as sklearn's DictVectorizer).
    trainData = []
    trainLabel = []
    n, m = map(int, input().split())
    for _ in range(n):
        data = input().split()
        data.pop(0)                      # discard the first field
        trainLabel.append(int(data[0]))  # the next field is the label
        data.pop(0)
        trainData.append({int(x.split(':')[0]): float(x.split(':')[1])
                          for x in data})
    trainData = transformer.fit_transform(trainData).toarray()
    return trainData, trainLabel

@classmethod
def training_split(cls,
                   dataset_folder,
                   number_of_validation_examples=500,
                   maximum_disparity=255):
    """Returns training and validation datasets.

    An example from the FlyingThings3D dataset is added to the training
    or validation dataset if:

    (1) it is a training example of the FlyingThings3D dataset;
    (2) it does not have rendering artifacts;
    (3) all its disparities are within the range [0, maximum_disparity].

    Args:
        dataset_folder: folder with the FlyingThings3D dataset, which contains
                        a "frames_cleanpass" folder with left and right
                        images and a "disparity" folder with disparities.
        number_of_validation_examples: number of examples from the training set
                                       that will be used for validation.
        maximum_disparity: maximum disparity in the training / validation
                           dataset. All training examples with disparity
                           larger than "maximum_disparity" are excluded
                           from the dataset.
    """
    examples = _find_examples(dataset_folder)
    # A fixed manual seed guarantees that the splits are the same across
    # different runs.
    random.seed(0)
    random.shuffle(examples)
    examples = _split_examples_into_training_and_test_sets(examples)[0]
    examples = _filter_out_examples_with_rendering_artifacts(examples)
    examples = _filter_out_examples_with_large_disparities(
        examples, maximum_disparity)
    _dataset = FlyingThings3D(examples)
    validation_dataset, training_dataset = _dataset.split_in_two(
        size_of_first_subset=number_of_validation_examples)
    return training_dataset, validation_dataset

def split(df):
    '''
    :param df: DataFrame to be split
    :return: (test set, training set), each sorted by user_id
    '''
    # Assumes sklearn.model_selection.train_test_split is imported; note that
    # the test set is returned first.
    trainingSet, testSet = train_test_split(df, test_size=0.2)
    sorted_trainSet = trainingSet.sort_values('user_id')
    sorted_testSet = testSet.sort_values('user_id')
    return sorted_testSet, sorted_trainSet

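A short usage sketch, assuming a ratings-style DataFrame with a user_id column (the data is illustrative); note that the test split comes back first:

import pandas as pd
from sklearn.model_selection import train_test_split

ratings = pd.DataFrame({"user_id": [3, 1, 2, 5, 4] * 4, "rating": range(20)})
test_set, train_set = split(ratings)
print(len(train_set), len(test_set))  # 16 4
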
def getTrainFeaturesAndLabels(data):
    '''
    getTrainFeaturesAndLabels() scales the features and encodes the labels with
    StandardScaler() and LabelEncoder() objects fitted here, and returns them
    alongside the transformed data so the same transforms can be reused later.
    '''
    # Assumes sklearn.preprocessing's LabelEncoder and StandardScaler are imported.
    # Set up the scaler and encoder; 'proto' is the label column.
    le = LabelEncoder()
    scaler = StandardScaler()
    le.fit(list(set(data.proto)))
    scaler.fit(data.drop('proto', axis=1))
    # Scale the features and encode the labels.
    features = scaler.transform(data.drop('proto', axis=1))
    target = le.transform(data.proto)
    return features, target, le, scaler

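Returning the fitted encoder and scaler lets a held-out split be transformed with statistics learned from the training data only. A minimal usage sketch, assuming a DataFrame with numeric feature columns and a 'proto' label column (the data is illustrative):

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

full = pd.DataFrame({"f1": range(10), "f2": range(10, 20),
                     "proto": ["tcp", "udp"] * 5})
train_df, test_df = full.iloc[:8], full.iloc[8:]

X_train, y_train, le, scaler = getTrainFeaturesAndLabels(train_df)

# Reuse the fitted objects on the test split: transform only, never refit.
X_test = scaler.transform(test_df.drop('proto', axis=1))
y_test = le.transform(test_df.proto)
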
