# -*- coding: utf-8 -*-
# Author: Zhi Qiao <mingshan_ai@163.com>
# License: BSD 2 clause
import os
import csv
import pickle
import random
import numpy as np
import pandas as pd
import tqdm
from tqdm._tqdm import trange
import time
try:
from ..utils.check import *
except:
from pyhealth.utils.check import *
class imagedata:
    """Build, persist and reload train/valid/test splits of path-referenced samples.

    The label table ``<data_root>/y_data/<sel_task>.csv`` is read with pandas;
    the first column of every row is a file name relative to
    ``<data_root>/x_data`` and the remaining columns are numeric labels.
    Only the file *paths* are stored as inputs; the files themselves are
    never opened by this class.
    """

    _SPLIT_NAMES = ('train', 'valid', 'test')

    def __init__(self, expdata_id, root_dir='.'):
        """
        Parameters
        ----------
        expdata_id : str
            name of the experiment data; it is passed to
            ``check_expdata_dir`` (imported from the project's ``utils.check``)
            and used as the sub-directory name under
            ``<root_dir>/experiments_data``
        root_dir : str, optional (default='.')
            directory that holds the ``experiments_data`` folder
        """
        self.expdata_id = expdata_id
        check_expdata_dir(expdata_id=expdata_id)
        self.root_dir = root_dir
        self.expdata_dir = os.path.join(
            self.root_dir, 'experiments_data', self.expdata_id)
        print('Current ExpData_ID: {0} --- Target for CMS'.format(
            self.expdata_id))

    @staticmethod
    def _row_limit(n_rows, n_limit):
        """Return how many label rows to scan (all rows when n_limit <= 0)."""
        return min(n_rows, n_limit) if n_limit > 0 else n_rows

    @staticmethod
    def _split(items, split_ratio):
        """Cut ``items`` into consecutive (train, valid, test) slices."""
        n_items = len(items)
        train_end = int(split_ratio[0] * n_items)
        valid_end = int((split_ratio[0] + split_ratio[1]) * n_items)
        return items[:train_end], items[train_end:valid_end], items[valid_end:]

    @staticmethod
    def _write_pickle(obj, path):
        """Pickle ``obj`` to ``path`` and close the file afterwards."""
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)

    @staticmethod
    def _read_pickle(path):
        """Unpickle and return the object stored at ``path``."""
        # NOTE: unpickling can execute arbitrary code; only point this at
        # experiment directories produced by trusted runs.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _save_splits(self, suffix, parts):
        """Write the three split parts as ``<split>_<suffix>.pkl`` files."""
        for split_name, part in zip(self._SPLIT_NAMES, parts):
            self._write_pickle(part, os.path.join(
                self.expdata_dir, '{0}_{1}.pkl'.format(split_name, suffix)))

    def _load_splits(self, suffix):
        """Read the ``<split>_<suffix>.pkl`` files in train/valid/test order."""
        return tuple(
            self._read_pickle(os.path.join(
                self.expdata_dir, '{0}_{1}.pkl'.format(split_name, suffix)))
            for split_name in self._SPLIT_NAMES)

    def _publish(self, x_parts, y_parts, label_n):
        """Expose the splits as ``self.train`` / ``self.valid`` / ``self.test``."""
        self.train, self.valid, self.test = (
            {'x': x_part, 'y': y_part, 'label_n': label_n}
            for x_part, y_part in zip(x_parts, y_parts))

    @staticmethod
    def _report(headline, stats):
        """Print the summary stored in the statistics dictionary."""
        print(headline)
        print('target Task:', stats['task'])
        print('N of labels:', stats['label_n'])
        print('N of TrainData:', stats['len_train'])
        print('N of ValidData:', stats['len_valid'])
        print('N of TestData:', stats['len_test'])

    def get_exp_data(self,
                     sel_task='diagnose',
                     shuffle=True,
                     split_ratio=[0.64, 0.16, 0.2],
                     data_root='',
                     n_limit=-1):
        """Scan the label table, split the samples and pickle the result.

        Parameters
        ----------
        sel_task : str, optional (default='diagnose')
            name of current healthcare task; selects
            ``<data_root>/y_data/<sel_task>.csv``
        shuffle : bool, optional (default=True)
            determine whether shuffle data or not (uses the global
            ``random`` state)
        split_ratio : list, optional (default=[0.64,0.16,0.2])
            used for split whole data into train/valid/test; only the first
            two entries are read, the test split receives the remainder
        data_root : str
            root directory of the raw data; must not be empty
        n_limit : int, optional (default = -1)
            scan only the first ``n_limit`` label rows; any value <= 0 means
            all rows

        Raises
        ------
        ValueError
            if ``data_root`` is an empty string
        """
        self.sel_task = sel_task
        if data_root == '':
            raise ValueError('fill in correct data_root')

        episode_dir = os.path.join(data_root, 'x_data')
        label_seq = pd.read_csv(os.path.join(
            data_root, 'y_data', self.sel_task + '.csv')).values

        label_n = 0
        samples = []
        for row_id in trange(self._row_limit(len(label_seq), n_limit)):
            row = label_seq[row_id, :]
            concrete_path = os.path.join(episode_dir, row[0])
            # rows whose input file is missing are silently dropped
            if not os.path.exists(concrete_path):
                continue
            samples.append((concrete_path, row[1:].astype(float)))
            label_n = len(row[1:])

        if shuffle:
            random.shuffle(samples)

        x_parts = self._split([path for path, _ in samples], split_ratio)
        y_parts = self._split([labels for _, labels in samples], split_ratio)

        os.makedirs(self.expdata_dir, exist_ok=True)
        self._save_splits('x', x_parts)
        print('finished X generate')
        self._save_splits('y', y_parts)
        print('finished Y generate')

        expdata_statistic = {
            'task': self.sel_task,
            # key spelling kept as-is: it is part of the persisted format
            'raio': split_ratio,
            'label_n': label_n,
            'len_train': len(x_parts[0]),
            'len_valid': len(x_parts[1]),
            'len_test': len(x_parts[2]),
        }
        self._write_pickle(expdata_statistic, os.path.join(
            self.expdata_dir, 'expdata_statistic.pkl'))

        self._publish(x_parts, y_parts, label_n)
        self._report('generate finished', expdata_statistic)

    def load_exp_data(self):
        """Reload the splits previously written by ``get_exp_data``.

        Raises
        ------
        FileNotFoundError
            if the experiment data directory does not exist
        """
        if not os.path.exists(self.expdata_dir):
            raise FileNotFoundError(
                'cannot find exp data dir {0}'.format(self.expdata_dir))

        x_parts = self._load_splits('x')
        y_parts = self._load_splits('y')
        expdata_statistic = self._read_pickle(os.path.join(
            self.expdata_dir, 'expdata_statistic.pkl'))

        self._publish(x_parts, y_parts, expdata_statistic['label_n'])
        self._report('load finished', expdata_statistic)

    def show_data(self, k=3):
        """Print the first ``k`` inputs and labels of every split.

        Parameters
        ----------
        k : int, optional (default=3)
            fetch k sample data for show
        """
        sections = (
            ('------------Train--------------', self.train),
            ('------------Valid--------------', self.valid),
            ('------------Test--------------', self.test),
        )
        for banner, split in sections:
            print(banner)
            print('x_data', split['x'][:k])
            print('y_data', split['y'][:k])
class sequencedata:
    """Build, persist and reload train/valid/test splits of CSV sequences.

    The label table ``<data_root>/y_data/<sel_task>.csv`` is read with pandas;
    the first column of every row is a CSV file name relative to
    ``<data_root>/x_data`` and the remaining columns are numeric labels.
    Each referenced CSV is read once to obtain its shape; the number of rows
    is stored as the sequence length next to the file path and the labels.
    """

    _SPLIT_NAMES = ('train', 'valid', 'test')

    def __init__(self, expdata_id, root_dir='.'):
        """
        Parameters
        ----------
        expdata_id : str
            name of the experiment data; it is passed to
            ``check_expdata_dir`` (imported from the project's ``utils.check``)
            and used as the sub-directory name under
            ``<root_dir>/experiments_data``
        root_dir : str, optional (default='.')
            directory that holds the ``experiments_data`` folder
        """
        self.expdata_id = expdata_id
        check_expdata_dir(expdata_id=expdata_id)
        self.root_dir = root_dir
        self.expdata_dir = os.path.join(
            self.root_dir, 'experiments_data', self.expdata_id)
        print('Current ExpData_ID: {0} --- Target for MIMIC'.format(
            self.expdata_id))

    @staticmethod
    def _row_limit(n_rows, n_limit):
        """Return how many label rows to scan (all rows when n_limit <= 0)."""
        return min(n_rows, n_limit) if n_limit > 0 else n_rows

    @staticmethod
    def _split(items, split_ratio):
        """Cut ``items`` into consecutive (train, valid, test) slices."""
        n_items = len(items)
        train_end = int(split_ratio[0] * n_items)
        valid_end = int((split_ratio[0] + split_ratio[1]) * n_items)
        return items[:train_end], items[train_end:valid_end], items[valid_end:]

    @staticmethod
    def _write_pickle(obj, path):
        """Pickle ``obj`` to ``path`` and close the file afterwards."""
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)

    @staticmethod
    def _read_pickle(path):
        """Unpickle and return the object stored at ``path``."""
        # NOTE: unpickling can execute arbitrary code; only point this at
        # experiment directories produced by trusted runs.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _save_splits(self, suffix, parts):
        """Write the three split parts as ``<split>_<suffix>.pkl`` files."""
        for split_name, part in zip(self._SPLIT_NAMES, parts):
            self._write_pickle(part, os.path.join(
                self.expdata_dir, '{0}_{1}.pkl'.format(split_name, suffix)))

    def _load_splits(self, suffix):
        """Read the ``<split>_<suffix>.pkl`` files in train/valid/test order."""
        return tuple(
            self._read_pickle(os.path.join(
                self.expdata_dir, '{0}_{1}.pkl'.format(split_name, suffix)))
            for split_name in self._SPLIT_NAMES)

    def _publish(self, x_parts, y_parts, l_parts, feat_n, label_n):
        """Expose the splits as ``self.train`` / ``self.valid`` / ``self.test``."""
        self.train, self.valid, self.test = (
            {'x': x_part, 'y': y_part, 'l': l_part,
             'feat_n': feat_n, 'label_n': label_n}
            for x_part, y_part, l_part in zip(x_parts, y_parts, l_parts))

    @staticmethod
    def _report(headline, stats):
        """Print the summary stored in the statistics dictionary."""
        print(headline)
        print('target Task:', stats['task'])
        print('N of features:', stats['feat_n'])
        print('N of labels:', stats['label_n'])
        print('N of TrainData:', stats['len_train'])
        print('N of ValidData:', stats['len_valid'])
        print('N of TestData:', stats['len_test'])

    def get_exp_data(self,
                     sel_task='phenotyping',
                     shuffle=True,
                     split_ratio=[0.64, 0.16, 0.2],
                     data_root='',
                     n_limit=-1):
        """Scan the label table, split the samples and pickle the result.

        Parameters
        ----------
        sel_task : str, optional (default='phenotyping')
            name of current healthcare task; selects
            ``<data_root>/y_data/<sel_task>.csv``
        shuffle : bool, optional (default=True)
            determine whether shuffle data or not (uses the global
            ``random`` state)
        split_ratio : list, optional (default=[0.64,0.16,0.2])
            used for split whole data into train/valid/test; only the first
            two entries are read, the test split receives the remainder
        data_root : str
            root directory of the raw data; must not be empty
        n_limit : int, optional (default = -1)
            scan only the first ``n_limit`` label rows; any value <= 0 means
            all rows

        Raises
        ------
        ValueError
            if ``data_root`` is an empty string
        """
        self.sel_task = sel_task
        if data_root == '':
            raise ValueError('fill in correct data_root')

        episode_dir = os.path.join(data_root, 'x_data')
        label_seq = pd.read_csv(os.path.join(
            data_root, 'y_data', self.sel_task + '.csv')).values

        feat_n, label_n = 0, 0
        samples = []
        for row_id in trange(self._row_limit(len(label_seq), n_limit)):
            row = label_seq[row_id, :]
            concrete_path = os.path.join(episode_dir, row[0])
            # rows whose sequence file is missing are silently dropped
            if not os.path.exists(concrete_path):
                continue
            seq_l, feat_n_all = pd.read_csv(concrete_path).shape
            # sequences with fewer than two rows are skipped
            if seq_l < 2:
                continue
            samples.append((concrete_path, seq_l, row[1:].astype(float)))
            label_n = len(row[1:])
            # one column is excluded from the feature count
            # (presumably a time/index column -- TODO confirm)
            feat_n = feat_n_all - 1

        if shuffle:
            random.shuffle(samples)

        x_parts = self._split([path for path, _, _ in samples], split_ratio)
        y_parts = self._split([labels for _, _, labels in samples], split_ratio)
        l_parts = self._split([length for _, length, _ in samples], split_ratio)

        os.makedirs(self.expdata_dir, exist_ok=True)
        self._save_splits('x', x_parts)
        print('finished X generate')
        self._save_splits('y', y_parts)
        print('finished Y generate')
        self._save_splits('l', l_parts)
        print('finished L generate')

        expdata_statistic = {
            'task': self.sel_task,
            # key spelling kept as-is: it is part of the persisted format
            'raio': split_ratio,
            'feat_n': feat_n,
            'label_n': label_n,
            'len_train': len(x_parts[0]),
            'len_valid': len(x_parts[1]),
            'len_test': len(x_parts[2]),
        }
        self._write_pickle(expdata_statistic, os.path.join(
            self.expdata_dir, 'expdata_statistic.pkl'))

        self._publish(x_parts, y_parts, l_parts, feat_n, label_n)
        self._report('generate finished', expdata_statistic)

    def load_exp_data(self):
        """Reload the splits previously written by ``get_exp_data``.

        Raises
        ------
        FileNotFoundError
            if the experiment data directory does not exist
        """
        if not os.path.exists(self.expdata_dir):
            raise FileNotFoundError(
                'cannot find exp data dir {0}'.format(self.expdata_dir))

        x_parts = self._load_splits('x')
        y_parts = self._load_splits('y')
        l_parts = self._load_splits('l')
        expdata_statistic = self._read_pickle(os.path.join(
            self.expdata_dir, 'expdata_statistic.pkl'))

        self._publish(x_parts, y_parts, l_parts,
                      expdata_statistic['feat_n'],
                      expdata_statistic['label_n'])
        self._report('load finished', expdata_statistic)

    def show_data(self, k=3):
        """Print the first ``k`` inputs, labels and lengths of every split.

        Parameters
        ----------
        k : int, optional (default=3)
            fetch k sample data for show
        """
        sections = (
            ('------------Train--------------', self.train),
            ('------------Valid--------------', self.valid),
            ('------------Test--------------', self.test),
        )
        for banner, split in sections:
            print(banner)
            print('x_data', split['x'][:k])
            print('y_data', split['y'][:k])
            print('l_data', split['l'][:k])
class ecgdata:
    """Build, persist and reload train/valid/test splits of in-memory features.

    Inputs are unpickled from ``<data_root>/x_data/feat.pkl`` and labels from
    ``<data_root>/y_data/<sel_task>.pkl``. Both objects are indexed by sample
    along their first axis; the size of their second axis is recorded as the
    feature count and the label count respectively.
    """

    _SPLIT_NAMES = ('train', 'valid', 'test')

    def __init__(self, expdata_id, root_dir='.'):
        """
        Parameters
        ----------
        expdata_id : str
            name of the experiment data; it is passed to
            ``check_expdata_dir`` (imported from the project's ``utils.check``)
            and used as the sub-directory name under
            ``<root_dir>/experiments_data``
        root_dir : str, optional (default='.')
            directory that holds the ``experiments_data`` folder
        """
        self.expdata_id = expdata_id
        check_expdata_dir(expdata_id=expdata_id)
        self.root_dir = root_dir
        self.expdata_dir = os.path.join(
            self.root_dir, 'experiments_data', self.expdata_id)
        print('Current ExpData_ID: {0} --- Target for ECG'.format(
            self.expdata_id))

    @staticmethod
    def _split(items, split_ratio):
        """Cut ``items`` into consecutive (train, valid, test) slices."""
        n_items = len(items)
        train_end = int(split_ratio[0] * n_items)
        valid_end = int((split_ratio[0] + split_ratio[1]) * n_items)
        return items[:train_end], items[train_end:valid_end], items[valid_end:]

    @staticmethod
    def _write_pickle(obj, path):
        """Pickle ``obj`` to ``path`` and close the file afterwards."""
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)

    @staticmethod
    def _read_pickle(path):
        """Unpickle and return the object stored at ``path``."""
        # NOTE: unpickling can execute arbitrary code; only point this at
        # data and experiment directories produced by trusted runs.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _save_splits(self, suffix, parts):
        """Write the three split parts as ``<split>_<suffix>.pkl`` files."""
        for split_name, part in zip(self._SPLIT_NAMES, parts):
            self._write_pickle(part, os.path.join(
                self.expdata_dir, '{0}_{1}.pkl'.format(split_name, suffix)))

    def _load_splits(self, suffix):
        """Read the ``<split>_<suffix>.pkl`` files in train/valid/test order."""
        return tuple(
            self._read_pickle(os.path.join(
                self.expdata_dir, '{0}_{1}.pkl'.format(split_name, suffix)))
            for split_name in self._SPLIT_NAMES)

    def _publish(self, x_parts, y_parts, feat_n, label_n):
        """Expose the splits as ``self.train`` / ``self.valid`` / ``self.test``."""
        self.train, self.valid, self.test = (
            {'x': x_part, 'y': y_part, 'feat_n': feat_n, 'label_n': label_n}
            for x_part, y_part in zip(x_parts, y_parts))

    @staticmethod
    def _report(headline, stats):
        """Print the summary stored in the statistics dictionary."""
        print(headline)
        print('target Task:', stats['task'])
        print('N of features:', stats['feat_n'])
        print('N of labels:', stats['label_n'])
        print('N of TrainData:', stats['len_train'])
        print('N of ValidData:', stats['len_valid'])
        print('N of TestData:', stats['len_test'])

    def get_exp_data(self,
                     sel_task='diagnose',
                     shuffle=True,
                     split_ratio=[0.64, 0.16, 0.2],
                     data_root='',
                     n_limit=-1):
        """Load features and labels, split the samples and pickle the result.

        Parameters
        ----------
        sel_task : str, optional (default='diagnose')
            name of current healthcare task; selects
            ``<data_root>/y_data/<sel_task>.pkl``
        shuffle : bool, optional (default=True)
            determine whether shuffle data or not (uses the global
            ``random`` state)
        split_ratio : list, optional (default=[0.64,0.16,0.2])
            used for split whole data into train/valid/test; only the first
            two entries are read, the test split receives the remainder
        data_root : str
            root directory of the raw data; must not be empty
        n_limit : int, optional (default = -1)
            keep only the first ``n_limit`` samples; any value <= 0 means
            all samples

        Raises
        ------
        ValueError
            if ``data_root`` is an empty string
        """
        self.sel_task = sel_task
        if data_root == '':
            raise ValueError('fill in correct data_root')

        feat_seq = self._read_pickle(
            os.path.join(data_root, 'x_data', 'feat.pkl'))
        label_seq = self._read_pickle(
            os.path.join(data_root, 'y_data', self.sel_task + '.pkl'))
        if n_limit > 0:
            feat_seq = feat_seq[:n_limit]
            label_seq = label_seq[:n_limit]

        label_n = np.shape(label_seq)[1]
        feat_n = np.shape(feat_seq)[1]

        # keep every label together with the feature entry at the same index
        # so that shuffling cannot separate them
        samples = [
            (np.array(feat_seq[cur_i]).astype(float),
             np.array(each_label).astype(float))
            for cur_i, each_label in enumerate(label_seq)
        ]
        if shuffle:
            random.shuffle(samples)

        x_parts = self._split([feat for feat, _ in samples], split_ratio)
        y_parts = self._split([label for _, label in samples], split_ratio)

        os.makedirs(self.expdata_dir, exist_ok=True)
        self._save_splits('x', x_parts)
        print('finished X generate')
        self._save_splits('y', y_parts)
        print('finished Y generate')

        expdata_statistic = {
            'task': self.sel_task,
            # key spelling kept as-is: it is part of the persisted format
            'raio': split_ratio,
            'feat_n': feat_n,
            'label_n': label_n,
            'len_train': len(x_parts[0]),
            'len_valid': len(x_parts[1]),
            'len_test': len(x_parts[2]),
        }
        self._write_pickle(expdata_statistic, os.path.join(
            self.expdata_dir, 'expdata_statistic.pkl'))

        self._publish(x_parts, y_parts, feat_n, label_n)
        self._report('generate finished', expdata_statistic)

    def load_exp_data(self):
        """Reload the splits previously written by ``get_exp_data``.

        Raises
        ------
        FileNotFoundError
            if the experiment data directory does not exist
        """
        if not os.path.exists(self.expdata_dir):
            raise FileNotFoundError(
                'cannot find exp data dir {0}'.format(self.expdata_dir))

        x_parts = self._load_splits('x')
        y_parts = self._load_splits('y')
        expdata_statistic = self._read_pickle(os.path.join(
            self.expdata_dir, 'expdata_statistic.pkl'))

        self._publish(x_parts, y_parts,
                      expdata_statistic['feat_n'],
                      expdata_statistic['label_n'])
        self._report('load finished', expdata_statistic)

    def show_data(self, k=3):
        """Print the first ``k`` inputs and labels of every split.

        Parameters
        ----------
        k : int, optional (default=3)
            fetch k sample data for show
        """
        sections = (
            ('------------Train--------------', self.train),
            ('------------Valid--------------', self.valid),
            ('------------Test--------------', self.test),
        )
        for banner, split in sections:
            print(banner)
            print('x_data', split['x'][:k])
            print('y_data', split['y'][:k])
class textdata:
    """Build, persist and reload train/valid/test splits of path-referenced notes.

    The label table ``<data_root>/y_data/<sel_task>.csv`` is read with pandas;
    the first column of every row is a file name relative to
    ``<data_root>/x_data`` and the remaining columns are numeric labels.
    Only the file *paths* are stored as inputs; the files themselves are
    never opened by this class.
    """

    _SPLIT_NAMES = ('train', 'valid', 'test')

    def __init__(self, expdata_id, root_dir='.'):
        """
        Parameters
        ----------
        expdata_id : str
            name of the experiment data; it is passed to
            ``check_expdata_dir`` (imported from the project's ``utils.check``)
            and used as the sub-directory name under
            ``<root_dir>/experiments_data``
        root_dir : str, optional (default='.')
            directory that holds the ``experiments_data`` folder
        """
        self.expdata_id = expdata_id
        check_expdata_dir(expdata_id=expdata_id)
        self.root_dir = root_dir
        self.expdata_dir = os.path.join(
            self.root_dir, 'experiments_data', self.expdata_id)
        print('Current ExpData_ID: {0} --- Target for Clinical Notes'.format(
            self.expdata_id))

    @staticmethod
    def _row_limit(n_rows, n_limit):
        """Return how many label rows to scan (all rows when n_limit <= 0)."""
        return min(n_rows, n_limit) if n_limit > 0 else n_rows

    @staticmethod
    def _split(items, split_ratio):
        """Cut ``items`` into consecutive (train, valid, test) slices."""
        n_items = len(items)
        train_end = int(split_ratio[0] * n_items)
        valid_end = int((split_ratio[0] + split_ratio[1]) * n_items)
        return items[:train_end], items[train_end:valid_end], items[valid_end:]

    @staticmethod
    def _write_pickle(obj, path):
        """Pickle ``obj`` to ``path`` and close the file afterwards."""
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)

    @staticmethod
    def _read_pickle(path):
        """Unpickle and return the object stored at ``path``."""
        # NOTE: unpickling can execute arbitrary code; only point this at
        # experiment directories produced by trusted runs.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _save_splits(self, suffix, parts):
        """Write the three split parts as ``<split>_<suffix>.pkl`` files."""
        for split_name, part in zip(self._SPLIT_NAMES, parts):
            self._write_pickle(part, os.path.join(
                self.expdata_dir, '{0}_{1}.pkl'.format(split_name, suffix)))

    def _load_splits(self, suffix):
        """Read the ``<split>_<suffix>.pkl`` files in train/valid/test order."""
        return tuple(
            self._read_pickle(os.path.join(
                self.expdata_dir, '{0}_{1}.pkl'.format(split_name, suffix)))
            for split_name in self._SPLIT_NAMES)

    def _publish(self, x_parts, y_parts, label_n):
        """Expose the splits as ``self.train`` / ``self.valid`` / ``self.test``."""
        self.train, self.valid, self.test = (
            {'x': x_part, 'y': y_part, 'label_n': label_n}
            for x_part, y_part in zip(x_parts, y_parts))

    @staticmethod
    def _report(headline, stats):
        """Print the summary stored in the statistics dictionary."""
        print(headline)
        print('target Task:', stats['task'])
        print('N of labels:', stats['label_n'])
        print('N of TrainData:', stats['len_train'])
        print('N of ValidData:', stats['len_valid'])
        print('N of TestData:', stats['len_test'])

    def get_exp_data(self,
                     sel_task='diagnose',
                     shuffle=True,
                     split_ratio=[0.64, 0.16, 0.2],
                     data_root='',
                     n_limit=-1):
        """Scan the label table, split the samples and pickle the result.

        Parameters
        ----------
        sel_task : str, optional (default='diagnose')
            name of current healthcare task; selects
            ``<data_root>/y_data/<sel_task>.csv``
        shuffle : bool, optional (default=True)
            determine whether shuffle data or not (uses the global
            ``random`` state)
        split_ratio : list, optional (default=[0.64,0.16,0.2])
            used for split whole data into train/valid/test; only the first
            two entries are read, the test split receives the remainder
        data_root : str
            root directory of the raw data; must not be empty
        n_limit : int, optional (default = -1)
            scan only the first ``n_limit`` label rows; any value <= 0 means
            all rows

        Raises
        ------
        ValueError
            if ``data_root`` is an empty string
        """
        self.sel_task = sel_task
        if data_root == '':
            raise ValueError('fill in correct data_root')

        episode_dir = os.path.join(data_root, 'x_data')
        label_seq = pd.read_csv(os.path.join(
            data_root, 'y_data', self.sel_task + '.csv')).values

        label_n = 0
        samples = []
        for row_id in trange(self._row_limit(len(label_seq), n_limit)):
            row = label_seq[row_id, :]
            concrete_path = os.path.join(episode_dir, row[0])
            # rows whose input file is missing are silently dropped
            if not os.path.exists(concrete_path):
                continue
            samples.append((concrete_path, row[1:].astype(float)))
            label_n = len(row[1:])

        if shuffle:
            random.shuffle(samples)

        x_parts = self._split([path for path, _ in samples], split_ratio)
        y_parts = self._split([labels for _, labels in samples], split_ratio)

        os.makedirs(self.expdata_dir, exist_ok=True)
        self._save_splits('x', x_parts)
        print('finished X generate')
        self._save_splits('y', y_parts)
        print('finished Y generate')

        expdata_statistic = {
            'task': self.sel_task,
            # key spelling kept as-is: it is part of the persisted format
            'raio': split_ratio,
            'label_n': label_n,
            'len_train': len(x_parts[0]),
            'len_valid': len(x_parts[1]),
            'len_test': len(x_parts[2]),
        }
        self._write_pickle(expdata_statistic, os.path.join(
            self.expdata_dir, 'expdata_statistic.pkl'))

        self._publish(x_parts, y_parts, label_n)
        self._report('generate finished', expdata_statistic)

    def load_exp_data(self):
        """Reload the splits previously written by ``get_exp_data``.

        Raises
        ------
        FileNotFoundError
            if the experiment data directory does not exist
        """
        if not os.path.exists(self.expdata_dir):
            raise FileNotFoundError(
                'cannot find exp data dir {0}'.format(self.expdata_dir))

        x_parts = self._load_splits('x')
        y_parts = self._load_splits('y')
        expdata_statistic = self._read_pickle(os.path.join(
            self.expdata_dir, 'expdata_statistic.pkl'))

        self._publish(x_parts, y_parts, expdata_statistic['label_n'])
        self._report('load finished', expdata_statistic)

    def show_data(self, k=3):
        """Print the first ``k`` inputs and labels of every split.

        Parameters
        ----------
        k : int, optional (default=3)
            fetch k sample data for show
        """
        sections = (
            ('------------Train--------------', self.train),
            ('------------Valid--------------', self.valid),
            ('------------Test--------------', self.test),
        )
        for banner, split in sections:
            print(banner)
            print('x_data', split['x'][:k])
            print('y_data', split['y'][:k])
if __name__ == '__main__':
    # Smoke run: build the clinical-notes splits from ./datasets/text and
    # read them back from the experiment directory.
    print('hello world')
    notes_demo = textdata('test.1.text')
    notes_demo.get_exp_data(
        sel_task='diagnose',
        data_root='./datasets/text',
    )
    notes_demo.load_exp_data()