data.py
import logging
import os
from functools import cached_property, lru_cache
from os.path import join, exists

import util
import io_util
from data_util import (
    get_all_docs,
    convert_docs_to_features,
)

logger = logging.getLogger(__name__)

class DataProcessor:
    def __init__(self, config):
        self.config = config

    @cached_property
    def tokenizer(self):
        # Build the tokenizer once and reuse it across get_data() calls
        return util.get_transformer_tokenizer(self.config)

    @classmethod
    def is_training(cls, partition):
        return 'train' in partition or 'all' in partition

    @lru_cache()
    def get_meta(self, dataset_name):
        # Load and normalize the dataset metadata; cached per dataset name
        meta_path = join(self.get_data_dir(dataset_name), 'meta.json')
        meta = io_util.read_json(meta_path)
        assert meta['attr2cluster']['brand'] == 0, 'Brand should have cluster id 0'
        meta['cluster2attr'] = {cluster: attr for attr, cluster in meta['attr2cluster'].items()}
        meta['special_attri'] = set(meta['special_attri'])
        meta['seed_attri'] = set(meta['seed_attri'])
        meta['new_attri'] = set(meta['new_attri'])
        return meta
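
    # Illustrative shape of meta.json, inferred from the keys read above;
    # the attribute names and cluster ids below are made-up examples, not
    # taken from the repository:
    #   {
    #     "attr2cluster": {"brand": 0, "color": 1, "material": 2},
    #     "special_attri": ["brand"],
    #     "seed_attri": ["color"],
    #     "new_attri": ["material"]
    #   }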

    def get_data(self, dataset_name, partition):
        doc_path = self.get_data_doc_path(dataset_name, partition)
        feat_path = self.get_data_feature_path(dataset_name, partition)
        conf, is_training = self.config, self.is_training(partition)
        # Get docs: reuse the on-disk cache if present, otherwise build from raw data
        if exists(doc_path):
            docs = io_util.read_jsonlines(doc_path)
            logger.info(f'Loaded docs from {doc_path}')
        else:
            logger.info(f'Getting docs for {dataset_name}-{partition}...')
            raw_path = self.get_data_raw_path(dataset_name, partition)
            docs = get_all_docs(dataset_name, raw_path, self.get_meta(dataset_name),
                                self.tokenizer, only_title=conf['only_title'], is_training=is_training)
            # Save for future runs
            io_util.write_jsonlines(doc_path, docs)
            logger.info(f'Saved docs to {doc_path}')
        # Get features: same cache-or-build pattern, keyed by tokenizer settings
        if exists(feat_path):
            features = io_util.read_pickle(feat_path)
            logger.info(f'Loaded features from {feat_path}')
        else:
            logger.info(f'Getting features for {dataset_name}-{partition}...')
            features = convert_docs_to_features(dataset_name, docs, self.tokenizer,
                                                max_seq_len=conf['max_seq_len'],
                                                is_training=is_training, show_example=True)
            # Save for future runs
            io_util.write_pickle(feat_path, features)
            logger.info(f'Saved features to {feat_path}')
        return docs, features

    def get_data_dir(self, dataset_name):
        return join(self.config['dataset_dir'], dataset_name)

    def get_data_raw_path(self, dataset_name, partition):
        return join(self.get_data_dir(dataset_name), f'{partition}.jsonlines')

    def get_data_doc_path(self, dataset_name, partition):
        save_dir = join(self.config['data_dir'], 'processed')
        os.makedirs(save_dir, exist_ok=True)
        t = self.config['model_type']
        title_bp = 'title' if self.config['only_title'] else 'title+bp'
        return join(save_dir, f'doc_{dataset_name}_{partition}_{title_bp}_{t}.jsonlines')

    def get_data_feature_path(self, dataset_name, partition):
        save_dir = join(self.config['data_dir'], 'processed')
        os.makedirs(save_dir, exist_ok=True)
        t = self.config['model_type']
        title_bp = 'title' if self.config['only_title'] else 'title+bp'
        msl = self.config['max_seq_len']
        return join(save_dir, f'feat_{dataset_name}_{partition}_{title_bp}_{t}_max{msl}.bin')
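
    # Example of the naming scheme above (hypothetical values): with
    # data_dir='data', model_type='bert', only_title=False, max_seq_len=128,
    # dataset 'products' and partition 'train', the feature cache lands at:
    #   data/processed/feat_products_train_title+bp_bert_max128.bin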

    def get_results_path(self, dataset_name, partition, suffix, ext='json'):
        save_dir = join(self.config['log_dir'], 'results')
        os.makedirs(save_dir, exist_ok=True)
        return join(save_dir, f'results_{dataset_name}_{partition}_{suffix}.{ext}')
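

# Usage sketch (not part of the original file): a minimal way to drive
# DataProcessor end to end. The config keys mirror those referenced above;
# the directory paths, model type, and dataset/partition names are
# illustrative assumptions, not values from the repository.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    config = {
        'dataset_dir': 'datasets',    # expects datasets/<name>/meta.json and <partition>.jsonlines
        'data_dir': 'data',           # processed docs/features are cached under data/processed/
        'log_dir': 'logs',            # results go under logs/results/
        'model_type': 'bert-base-uncased',
        'only_title': True,
        'max_seq_len': 128,
    }
    processor = DataProcessor(config)
    docs, features = processor.get_data('products', 'train')  # hypothetical dataset/partition
    logger.info(f'Loaded {len(docs)} docs and {len(features)} features')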