# speakart.py
import os
import time
from io import BytesIO

import numpy as np
import requests
import streamlit as st
import streamlit.components.v1 as components
from docarray import Document
from PIL import Image
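# Basic page setup; the CSS below trims Streamlit's default top margin,
# shrinks the audio player, and recolors links for dark and light themes.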
st.set_page_config(page_title="streamlit_audio_recorder")
st.markdown('''<style>.css-1egvi7u {margin-top: -3rem;}</style>''',
unsafe_allow_html=True)
st.markdown('''<style>.stAudio {height: 45px;}</style>''',
unsafe_allow_html=True)
st.markdown('''<style>.css-v37k9u a {color: #ff4c4b;}</style>''',
unsafe_allow_html=True) # darkmode
st.markdown('''<style>.css-nlntq9 a {color: #ff4c4b;}</style>''',
unsafe_allow_html=True) # lightmode
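
# The st_audiorec component returns the recorded WAV blob to Python as a
# dict of {index: byte value}; record_audio() reassembles those bytes and
# writes them to input.wav for the transcription step.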
def record_audio():
    """Render the recorder UI and save the captured audio as input.wav."""
    parent_dir = os.path.dirname(os.path.abspath(__file__))
    build_dir = os.path.join(parent_dir, "st_audiorec/frontend/build")
    st_audiorec = components.declare_component("st_audiorec", path=build_dir)

    st.title("Speaking Art into Existence \U0001F58C")
    st.markdown(
        'Powered by <img src="https://www.assemblyai.com/_next/static/media/light.f35331fb.svg" width="100" height="100">',
        unsafe_allow_html=True)
    st.markdown('An AssemblyAI Speech-to-Text API Demo Implemented by '
                '[Tobi John (MLOps_engineer)](https://twitter.com/MLOps_engineer) - '
                'view project source code on '
                '[GitHub](/~https://github.com/tobsiee/AssemblyAI)')
    st.write('\n\n')

    # STREAMLIT AUDIO RECORDER instance: the web component returns an
    # array buffer built from the WAV blob once recording stops.
    val = st_audiorec()
    if isinstance(val, dict):
        with st.spinner('retrieving audio-recording...'):
            ind, val = zip(*val['arr'].items())
            ind = np.array(ind, dtype=int)  # byte positions
            val = np.array(val)             # byte values
            # Reorder the bytes by their original position before joining.
            sorted_ints = val[np.argsort(ind)]
            stream = BytesIO(b"".join([int(v).to_bytes(1, "big") for v in sorted_ints]))
            wav_bytes = stream.read()
        # wav_bytes now holds the complete WAV file, ready for upload.
        # st.audio(wav_bytes, format='audio/wav')  # optional playback
        with open('input.wav', 'wb') as f:
            f.write(wav_bytes)
# AssemblyAI configuration: the API key is read from the environment.
assemblyai_key = os.environ.get('ASSEMBLYAI_API_KEY')
upload_endpoint = "https://api.assemblyai.com/v2/upload"
transcript_endpoint = "https://api.assemblyai.com/v2/transcript"
headers = {
    'Authorization': assemblyai_key,
    'content-type': 'application/json',
}
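
# AssemblyAI's upload endpoint accepts a streamed request body, so the
# helper below reads the WAV file in 5 MB chunks instead of loading it
# into memory all at once.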
def assemblyai_upload(file):
    def read_file_chunks(file):
        # Yield the audio file in 5 MB chunks.
        with open(file, 'rb') as f:
            while True:
                data = f.read(5_242_880)
                if not data:
                    break
                yield data

    upload_response = requests.post(upload_endpoint, headers=headers,
                                    data=read_file_chunks(file))
    return upload_response.json()['upload_url']
def transcribe(upload_url):
    # Submit the uploaded audio for transcription and return the job id.
    payload = {"audio_url": upload_url}
    transcribe_response = requests.post(transcript_endpoint, json=payload, headers=headers)
    transcription_id = transcribe_response.json()['id']
    st.write('AssemblyAI has received your prompt and is now transcribing \U0001F642')
    return transcription_id
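
# Transcription is asynchronous: poll the transcript until its status is
# "completed" (or "error") before reading the text.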
def get_transcription_result(transcription_id):
    endpoint = f"{transcript_endpoint}/{transcription_id}"
    while True:
        response = requests.get(endpoint, headers=headers)
        current_status = response.json()['status']
        if current_status in ("completed", "error"):
            # On error the 'text' field is None; the caller just displays it.
            return response.json()['text']
        time.sleep(12)  # wait before polling again
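
# DALL-E Flow pipeline: generate six candidates from the prompt, refine a
# hard-coded favorite with a diffusion pass, then upscale the final pick.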
def call_dalle(prompt):
    server_url = 'grpcs://dalle-flow.dev.jina.ai'
    doc = Document(text=prompt).post(server_url, parameters={'num_images': 6})
    da = doc.matches
    # Note: plot_image_sprites() opens a matplotlib figure on the host;
    # only the final upscaled image is rendered in the Streamlit UI.
    da.plot_image_sprites(fig_size=(10, 10), show_index=True)

    # Pick a fixed "favorite" candidate and refine it with diffusion.
    fav_id = 4
    fav = da[fav_id]
    fav.embedding = doc.embedding
    diffused = fav.post(server_url, parameters={'skip_rate': 0.6, 'num_images': 4},
                        target_executor='diffusion').matches
    diffused.plot_image_sprites(fig_size=(2, 2), show_index=True)

    # Upscale a fixed pick from the diffusion results.
    dfav_id = 1
    fav = diffused[dfav_id]
    fav = fav.post(f'{server_url}/upscale')

    # Normalize the upscaled image tensor and save it to disk.
    d = (
        Document(uri=fav.uri)
        .load_uri_to_image_tensor()
        .set_image_tensor_shape(shape=(224, 224))
        .set_image_tensor_normalization()
        .set_image_tensor_channel_axis(-1, 0)
    )
    d.save_image_tensor_to_file('image.png', channel_axis=0)
    image = Image.open('image.png')
    st.image(image, caption="Generated Image", use_column_width=True)
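
# End-to-end flow: record -> upload -> transcribe -> generate an image.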
def main(file):
    record_audio()
    # Streamlit reruns this script from the top on every interaction, so
    # skip the rest of the pipeline until a recording has been saved.
    if not os.path.exists(file):
        return
    # Brief pause to make sure the file has finished being written.
    time.sleep(5)
    upload_url = assemblyai_upload(file)
    transcription_id = transcribe(upload_url)
    prompt = get_transcription_result(transcription_id)
    st.info(prompt)
    call_dalle(prompt)
    st.write('Your image has been generated as shown above \U0001F600')


if __name__ == "__main__":
    main("input.wav")
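
# To run locally (assuming an AssemblyAI key in the environment):
#   export ASSEMBLYAI_API_KEY=<your-key>
#   streamlit run speakart.py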