-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMain.py
146 lines (121 loc) · 5.06 KB
/
Main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
# Root directory; every non-hidden sub-directory is treated as one captcha source.
path = 'captcha-images'
# 62 alphanumeric classes (digits, uppercase, lowercase) plus one extra index,
# which get_batch uses as the padding value in partial answers.
vocab_size = 62 + 1
# Length that get_batch pads encoded partial answers to.
max_caption_len = 10

# File-name suffixes that mark an entry as an image.
_IMAGE_SUFFIXES = (".png", ".jpeg", ".jpg", ".gif")

# Both dicts map sub-directory name -> sorted list of image file names.
# A directory is "labeled" when it contains a control.txt file.
labeled_samples = {}
unlabeled_samples = {}
for possibledir in os.listdir(path):
    sample_dir = os.path.join(path, possibledir)
    if possibledir.startswith('.') or not os.path.isdir(sample_dir):
        continue
    contents_of_dir = sorted(os.listdir(sample_dir))
    # Match on the suffix: a substring test would also select names such as
    # "x.png.txt", which are not images and would shift the pairing with the
    # answers read from control.txt.
    images = [name for name in contents_of_dir if name.endswith(_IMAGE_SUFFIXES)]
    if "control.txt" in contents_of_dir:
        labeled_samples[possibledir] = images
    else:
        unlabeled_samples[possibledir] = images
# Pair every labeled image file name with its answer.  Line i of control.txt is
# taken as the answer for the i-th image of the directory; zip() silently drops
# the surplus when the two counts differ.
for k in list(labeled_samples):
    # The context manager closes the file even if reading fails.
    with open(path + "/" + k + "/control.txt") as f:
        # Strip the line terminator, including the "\r" of CRLF files, so it
        # does not end up as a character of the answer.
        answers = [answer.rstrip("\r\n") for answer in f]
    # Materialize the pairs: later code indexes and slices this sequence.
    labeled_samples[k] = list(zip(labeled_samples[k], answers))
from scipy import misc
from scipy.ndimage import imread
# Replace each (file name, answer) pair by (pixel vector, answer).  The image is
# read as RGB, resized with size=(128, 16), flattened to one dimension and
# divided by 256.0.
for k in labeled_samples:
    pairs = labeled_samples[k]
    for i, (file_name, answer) in enumerate(pairs):
        rgb = imread(path + "/" + k + "/" + file_name, mode='RGB')
        resized = misc.imresize(rgb, size=(128, 16))
        pairs[i] = (resized.flatten() / float(256.0), answer)
# Split every labeled directory into a leading training part and a trailing
# validation part; `ratio` is the fraction of samples kept for training.
training_samples = {}
validation_samples = {}
ratio = 0.9
for k, samples in labeled_samples.items():
    cut = int(ratio * len(samples))
    training_samples[k] = samples[:cut]
    validation_samples[k] = samples[cut:]
# Replace each unlabeled image file name by its pixel vector, using the same
# steps as for the labeled images: read as RGB, resize with size=(128, 16),
# flatten and divide by 256.0.
for k in unlabeled_samples:
    names = unlabeled_samples[k]
    for i, file_name in enumerate(names):
        rgb = imread(path + "/" + k + "/" + file_name, mode='RGB')
        names[i] = misc.imresize(rgb, size=(128, 16)).flatten() / float(256.0)
# For displaying captcha
# import matplotlib
# matplotlib.use('macosx')
# import matplotlib.pyplot as plt
# plt.imshow(labeled_samples['xanga'][72][0])
# print labeled_samples['xanga'][72][1]
# plt.show()
import numpy
import random
import sys
import string
# Seed that get_batch feeds to random.seed() on every call.
rand_seed_value = 0

# mapping: character -> class index.
#   '0'-'9' -> 0-9, 'A'-'Z' -> 10-35, 'a'-'z' -> 36-61
mapping = {}
idx = 0
for c in string.digits:
    mapping[c] = ord(c) - ord('0') + idx
idx += len(string.digits)
# ascii_uppercase, not string.uppercase: the latter is locale-dependent on
# Python 2 (and missing on Python 3), while the index arithmetic below is only
# valid for the 26 ASCII letters.
for c in string.ascii_uppercase:
    mapping[c] = ord(c) - ord('A') + idx
idx += len(string.ascii_uppercase)
for c in string.ascii_lowercase:
    mapping[c] = ord(c) - ord('a') + idx
def get_batch(size, partial_answer=False, step=False):
    """Draw a pseudo-random batch from ``labeled_samples``.

    The ``random`` module is re-seeded with the global ``rand_seed_value`` on
    every call, so repeated calls return the same batch until a call with
    ``step=True`` replaces the seed.

    Args:
        size: number of samples to draw (with replacement).
        partial_answer: when False, the full answer of each sample is
            returned.  When True, a random split position is chosen inside
            each answer; the characters before it are returned as encoded
            prefix and the character at it as one-hot target.
        step: when True, ``rand_seed_value`` is replaced by a new random
            value after the batch has been drawn.

    Returns:
        ``[images, answers]`` when ``partial_answer`` is False, else
        ``[images, prefixes, next_chars]``.  ``images`` is a float array of
        the stored sample data, ``prefixes`` holds ``mapping`` indices
        left-padded with ``len(mapping)`` to ``max_caption_len`` entries, and
        ``next_chars`` holds rows of ``numpy.eye(vocab_size)``.

    Raises:
        IndexError: if ``size`` is positive and ``labeled_samples`` is empty.
        ValueError: if ``partial_answer`` is True and a drawn answer is empty.
        KeyError: if an answer contains a character missing from ``mapping``.
    """
    global rand_seed_value
    random.seed(rand_seed_value)

    # NOTE(review): samples are drawn from labeled_samples, which also holds
    # the entries sliced into validation_samples -- confirm this is intended.
    keys = list(labeled_samples)
    if size > 0 and not keys:
        raise IndexError("labeled_samples is empty")
    pad_index = len(mapping)
    one_hot = numpy.eye(vocab_size)

    img_list = []
    answer_list = []
    partial_answer_list = []
    rest_answer_list = []
    for _ in range(size):
        # random.randint includes both end points, so the upper bound must be
        # len - 1.  Subtracting 1 from the result instead yields -1..len-1 and
        # makes the last key twice as likely as the others.
        rand_key = keys[random.randint(0, len(keys) - 1)]
        samples = labeled_samples[rand_key]
        image, answer = samples[random.randint(0, len(samples) - 1)]
        img_list.append(image)
        if not partial_answer:
            answer_list.append(answer)
            continue
        rand_end_char = random.randint(0, len(answer) - 1)
        prefix = [mapping[ch] for ch in answer[:rand_end_char]]
        # Keep only the most recent max_caption_len characters so every row
        # has the same length even for answers longer than max_caption_len.
        prefix = prefix[max(0, len(prefix) - max_caption_len):]
        padding = [pad_index] * (max_caption_len - len(prefix))
        partial_answer_list.append(padding + prefix)
        rest_answer_list.append(one_hot[mapping[answer[rand_end_char]]])

    if step:
        # sys.maxsize exists on Python 2.6+ and 3; sys.maxint is Python 2 only.
        rand_seed_value = random.randint(0, sys.maxsize)

    if not partial_answer:
        return [numpy.array(img_list, dtype=float), answer_list]
    return [numpy.array(img_list, dtype=float),
            numpy.array(partial_answer_list),
            numpy.array(rest_answer_list)]
def _class_index_to_char(index):
    """Turn a class index back into a character.

    Indices up to 9 map to digits, up to 35 to uppercase letters, and anything
    above is offset into the lowercase range.
    """
    if index <= 9:
        offset = ord('0')
    elif index <= (9 + 26):
        offset = ord('A') - 10
    else:
        offset = ord('a') - 36
    return chr(index + offset)


# Smoke test executed at import time: draw two partial-answer samples (which
# also advances the seed), then print the encoded prefixes, the decoded prefix
# of the first sample and the decoded next character of the first sample.
a, b, c = get_batch(2, True, True)
print(b)
for encoded in b[0]:
    print(_class_index_to_char(encoded))

# Last position of the first one-hot row that is set; 0 when none is set.
hot_positions = [pos for pos in range(0, 63) if c[0][pos] == 1]
print(_class_index_to_char(hot_positions[-1] if hot_positions else 0))
def get_test_batch(size):
    """Draw `size` random entries from ``unlabeled_samples``.

    Each draw first picks a directory key uniformly with numpy.random, then an
    entry of that directory uniformly.

    Returns:
        A 2-tuple holding the drawn batch twice, as two separate float arrays
        (presumably input and target of the same data -- confirm with callers).
    """
    site_names = list(unlabeled_samples.keys())
    drawn = []
    for _ in range(size):
        site = site_names[numpy.random.randint(low=0, high=len(unlabeled_samples))]
        entries = unlabeled_samples[site]
        position = numpy.random.randint(low=0, high=len(entries))
        drawn.append(entries[position])
    return numpy.array(drawn, dtype=float), numpy.array(drawn, dtype=float)
#
# def get_sample():
# global labeled_samples
# labeled_samples = labeled_samples
# while True:
# rand_key = labeled_samples.keys()[numpy.random.randint(low=0, high=len(labeled_samples))]
# rand_ind = numpy.random.randint(low=0, high=len(list(labeled_samples[rand_key])))
# yield numpy.array(labeled_samples[rand_key][rand_ind][0]).reshape(1, 2048*3), numpy.array(labeled_samples[rand_key][rand_ind][0]).reshape(1, 2048*3)