model_v40.py
"""
NN model with glove embeddings
layers:
1. embedding layer (glove)
2. SpatialDropout1D (0.2)
3. bidirectional lstm & gru
4. [global_max_pooling1d, attention]
5. dense 128 (0.15 dropout) & 16
6. output (sigmoid)
"""
import os
import gc

import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import (Input, Embedding, SpatialDropout1D, Bidirectional,
                          LSTM, GRU, GlobalMaxPool1D, Concatenate, Dropout,
                          Dense)
from keras.models import Model

from neural_networks import Attention
from neural_networks import NeuralNetworkClassifier

from tqdm import tqdm

tqdm.pandas()  # register `progress_apply` on pandas Series/DataFrames

# model configs
MAX_FEATURES = int(2.5e5) # total word count = 227,538; clean word count = 186,551 # noqa
MAX_LEN = 75 # mean_len = 12; Q99_len = 40; max_len = 189;
RNN_UNITS = 40
DENSE_UNITS_1 = 128
DENSE_UNITS_2 = 16
# file configs
MODEL_FILEPATH = os.path.join(
    os.environ['DATA_PATH'],
    'models',
    'model_v40.hdf5'
)
EMBED_FILEPATH = os.path.join(
    os.environ['DATA_PATH'],
    'embeddings',
    'glove.840B.300d',
    'glove.pkl'
)
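# Expected layout under $DATA_PATH (inferred from the two paths above, not
# confirmed elsewhere in this file):
#   $DATA_PATH/models/model_v40.hdf5                  <- checkpoint target
#   $DATA_PATH/embeddings/glove.840B.300d/glove.pkl   <- pickled weight matrix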


def get_network(embed_filepath):
    input_layer = Input(shape=(MAX_LEN, ), name='input')
    # 1. embedding layer
    # load the pre-trained embedding weights
    print('load pre-trained embedding weights ......')
    embed_weights = pd.read_pickle(embed_filepath)
    input_dim = embed_weights.shape[0]
    output_dim = embed_weights.shape[1]
    x = Embedding(
        input_dim=input_dim,
        output_dim=output_dim,
        weights=[embed_weights],
        trainable=False,
        name='embedding'
    )(input_layer)
    # free the weight matrix once it is copied into the layer
    del embed_weights, input_dim, output_dim
    gc.collect()
    # 2. dropout that drops entire embedding channels
    x = SpatialDropout1D(rate=0.2)(x)
    # 3. bidirectional lstm & gru, both returning the full hidden-state
    #    sequence so pooling and attention can consume every timestep
    x = Bidirectional(
        layer=LSTM(RNN_UNITS, return_sequences=True),
        name='bidirectional_lstm'
    )(x)
    x = Bidirectional(
        layer=GRU(RNN_UNITS, return_sequences=True),
        name='bidirectional_gru'
    )(x)
    # 4. concat global_max_pooling1d and attention
    max_pool = GlobalMaxPool1D(name='global_max_pooling1d')(x)
    atten = Attention(step_dim=MAX_LEN, name='attention')(x)
    x = Concatenate(axis=-1)([max_pool, atten])
    # 5. dense
    x = Dense(units=DENSE_UNITS_1, activation='relu', name='dense_1')(x)
    x = Dropout(rate=0.15)(x)
    x = Dense(units=DENSE_UNITS_2, activation='relu', name='dense_2')(x)
    # 6. output (sigmoid)
    output_layer = Dense(units=1, activation='sigmoid', name='output')(x)
    return Model(inputs=input_layer, outputs=output_layer)
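

# The Attention layer above is imported from neural_networks, whose source is
# not shown in this file. As a reading aid only, AttentionSketch below is a
# minimal, hypothetical stand-in for what such a layer typically computes: a
# learned score per timestep, softmaxed over the sequence, then used to take
# a weighted sum of the hidden states. It is an assumption about the real
# layer's behavior and is not used anywhere in this model.
from keras import backend as K  # noqa: E402
from keras.layers import Layer  # noqa: E402


class AttentionSketch(Layer):

    def __init__(self, step_dim, **kwargs):
        self.step_dim = step_dim
        super(AttentionSketch, self).__init__(**kwargs)

    def build(self, input_shape):
        # one learned weight per feature dimension -> scalar score per step
        self.w = self.add_weight(name='att_weight',
                                 shape=(input_shape[-1], 1),
                                 initializer='glorot_uniform',
                                 trainable=True)
        super(AttentionSketch, self).build(input_shape)

    def call(self, x):
        # x: (batch, steps, features)
        scores = K.squeeze(K.dot(x, self.w), axis=-1)   # (batch, steps)
        weights = K.softmax(scores)                     # sum to 1 over steps
        return K.sum(x * K.expand_dims(weights, axis=-1), axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])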


def get_model():
    print('build network ......')
    model = get_network(embed_filepath=EMBED_FILEPATH)
    model.summary()  # prints the layer table directly
    return NeuralNetworkClassifier(
        model,
        balancing_class_weight=True,
        filepath=MODEL_FILEPATH)
"""
text cleaning
"""
def preprocess(text):
from model_v30 import preprocess as preprocess_base
text = preprocess_base(text)
return text


def tokenize(df_text):
    # preprocess
    df_text = df_text.progress_apply(preprocess)
    # tokenizer; filtering and lowercasing are left to preprocess()
    tokenizer = Tokenizer(
        num_words=MAX_FEATURES,
        filters='',
        lower=False,
        split=' ')
    # fit to data
    tokenizer.fit_on_texts(list(df_text))
    # tokenize the texts into sequences
    sequences = tokenizer.texts_to_sequences(df_text)
    return sequences, tokenizer


def transform(df_text):
    # NOTE: the tokenizer is re-fit on whatever texts are passed in, so the
    # vocabulary is derived from df_text itself rather than a fixed corpus
    seqs, _ = tokenize(df_text)
    # pad/truncate every sequence to exactly MAX_LEN tokens
    X = pad_sequences(seqs, maxlen=MAX_LEN, padding='pre', truncating='post')
    return X
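

# Usage sketch (hypothetical: assumes a pandas Series of raw question strings
# and that NeuralNetworkClassifier exposes a fit() method -- its interface is
# defined in neural_networks, which is not shown here):
#
#     questions = pd.Series(['How do magnets work?', 'Why is the sky blue?'])
#     X = transform(questions)   # -> int matrix of shape (2, MAX_LEN)
#     clf = get_model()          # requires DATA_PATH and the GloVe pickle
#     clf.fit(X, y)              # y: binary labels aligned with `questions`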