Skip to content

Commit

Permalink
Merge pull request #13 from ionite34/dev
Browse files Browse the repository at this point in the history
  • Loading branch information
ionite34 authored May 24, 2022
2 parents ed180c4 + 518b53c commit 2f767d8
Show file tree
Hide file tree
Showing 15 changed files with 117 additions and 54 deletions.
22 changes: 20 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ The pipeline employs a context layer, multiple transformer and n-gram morpho-ort
and an autoregressive recurrent neural transformer base. The current implementation offers state-of-the-art accuracy for out-of-vocabulary (OOV) words, as well as contextual
analysis for correct inferencing of [English Heteronyms](https://en.wikipedia.org/wiki/Heteronym_(linguistics)).

The package is offered in a pre-trained state that is ready for use as a dependency or in
The package is offered in a pre-trained state that is ready for [usage](#Usage) as a dependency or in
notebook environments. There are no additional resources needed, other than the model checkpoint which is
automatically downloaded on the first usage. See [Installation](#Installation) for more information.

Expand Down Expand Up @@ -68,6 +68,8 @@ pip install aquila-resolve
## Usage

### 1. Module

```python
from Aquila_Resolve import G2p

Expand All @@ -77,13 +79,29 @@ g2p.convert('The book costs $5, will you read it?')
# >> '{DH AH0} {B UH1 K} {K AA1 S T S} {F AY1 V} {D AA1 L ER0 Z}, {W IH1 L} {Y UW1} {R IY1 D} {IH1 T}?'
```

> Additional optional parameters are available when defining a `G2p` instance:
> Optional parameters when defining a `G2p` instance:
| Parameter | Default | Description |
|-------------------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `device` | `'cpu'` | Device for PyTorch inference model. GPU is supported using `'cuda'` |

> Optional parameters when calling `convert`:
| Parameter | Default | Description |
|-------------------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `convert_num` | `True` | Toggles conversion of some numbers and symbols to their spoken pronunciation forms. See [numbers.py](src/Aquila_Resolve/text/numbers.py) for details on what is covered. |

### 2. Command Line

A simple wrapper for text conversion is available through the `aquila-resolve` command:
```
~
❯ aquila-resolve
✔ Aquila Resolve v0.1.2
? Text to convert: I read the book, did you read it?
{AY1} {R EH1 D} {DH AH0} {B UH1 K}, {D IH1 D} {Y UW1} {R IY1 D} {IH1 T}?
```

## Model Architecture

In evaluation[^1], neural G2P models have traditionally been extremely sensitive to orthographical variations
Expand Down
2 changes: 2 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,7 @@ torch~=1.11.0
inflect>=2.1.0
requests>=2.23.0
numpy>=1.18.0
inquirerpy>=0.3.3
yaspin>=2.1.0
pytest>=7.1.2
pytest_mock>=3.7.0
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ pywordsegment>=0.2.1
torch~=1.11.0
inflect>=2.1.0
requests>=2.23.0
numpy>=1.18.0
numpy>=1.18.0
inquirerpy>=0.3.3
yaspin>=2.1.0
12 changes: 9 additions & 3 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
[metadata]
name = Aquila-Resolve
version = 0.1.2
version = 0.1.3
author = ionite
author_email = dev@ionite.io
description = Augmented Recurrent Neural Grapheme-to-Phoneme conversion with Inflectional Orthography.
description = Augmented Neural English G2p converter with Inflectional Orthography.
long_description = file: README.md
long_description_content_type = text/markdown
url = /~https://github.com/ionite34/Aquila-Resolve
Expand Down Expand Up @@ -32,11 +32,17 @@ install_requires =
inflect>=2.1.0
requests>=2.23.0
numpy>=1.18.0
inquirerpy>=0.3.3
yaspin>=2.1.0
zip_safe = False
include_package_data = True

[options.package_data]
* = *.json, *.json.gz

[options.packages.find]
where = src
where = src

[options.entry_points]
console_scripts =
aquila-resolve = Aquila_Resolve.cli:main_menu
2 changes: 1 addition & 1 deletion src/Aquila_Resolve/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Grapheme to Phoneme Resolver
"""
__version__ = "0.1.2"
__version__ = "0.1.3"

from .g2p import G2p
from .data.remote import download
4 changes: 4 additions & 0 deletions src/Aquila_Resolve/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .cli import main_menu

# Entry point for `python -m Aquila_Resolve`: launch the interactive CLI
if __name__ == "__main__":  # pragma: no cover
    main_menu()
1 change: 1 addition & 0 deletions src/Aquila_Resolve/cli/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .cli import main_menu
29 changes: 29 additions & 0 deletions src/Aquila_Resolve/cli/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# CLI Entry Point
import Aquila_Resolve
from InquirerPy import inquirer
from InquirerPy.utils import color_print as cp
from yaspin import yaspin


def main_menu():
    """
    Aquila Resolve Entry Point

    Runs the G2P conversion prompt and then terminates the process with
    exit status 0.

    :raises SystemExit: Always, with code 0, once the prompt returns.
    """
    g2p_convert()
    # The ``exit`` builtin is injected by the ``site`` module for interactive
    # sessions and is not available under ``python -S`` or in frozen builds;
    # raising SystemExit directly is equivalent and always defined.
    raise SystemExit(0)


def g2p_convert():  # pragma: no cover
    """
    Interactive G2P conversion loop

    Builds a G2p converter behind a spinner, then repeatedly prompts for
    text and prints its conversion. Returns when an empty input is given.
    """
    with yaspin('Initializing Aquila Resolve Backend...', color='yellow') as spinner:
        converter = Aquila_Resolve.G2p()
        spinner.ok(f'✔ Aquila Resolve v{Aquila_Resolve.__version__}')

    while True:
        user_text = inquirer.text("Text to convert:").execute()
        if user_text:
            phonemes = converter.convert(user_text)
            cp([('yellow', f'{phonemes}')])
            continue
        # Empty input ends the session
        return
3 changes: 3 additions & 0 deletions src/Aquila_Resolve/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@


DATA_PATH = files(__name__)
CMU_FILE = DATA_PATH.joinpath("cmudict.json.gz")
HET_FILE = DATA_PATH.joinpath("heteronyms.json")
PT_FILE = DATA_PATH.joinpath("model.pt")
23 changes: 22 additions & 1 deletion src/Aquila_Resolve/data/remote.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Access and checks for remote data
import requests
import shutil
import nltk
from warnings import warn
from tqdm.auto import tqdm
from . import DATA_PATH
Expand Down Expand Up @@ -54,6 +55,20 @@ def ensure_download() -> None:
"Aquila_Resolve/data/ folder.")


def ensure_nltk() -> None:  # pragma: no cover
    """
    Ensures all required NLTK data packages are installed.

    Each package is looked up by its resource path; any package that cannot
    be found is downloaded with ``raise_on_error=True``.
    """
    packages = (
        ('wordnet', 'corpora/wordnet.zip'),
        ('omw-1.4', 'corpora/omw-1.4.zip'),
        ('averaged_perceptron_tagger', 'taggers/averaged_perceptron_tagger.zip'),
    )
    for package, resource_path in packages:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            # Resource not found locally: fetch the package
            nltk.download(package, raise_on_error=True)


def check_updates() -> None:
"""Checks if the model matches the latest checksum"""
if not check_model():
Expand All @@ -62,7 +77,13 @@ def check_updates() -> None:


def get_checksum(file: str, block_size: int = 65536) -> str:
"""Calculates the checksum of a file"""
"""
Calculates the Sha256 checksum of a file
:param file: Path to file
:param block_size: Block size for reading
:return: Checksum of file
"""
import hashlib
s = hashlib.sha256()
with open(file, 'rb') as f:
Expand Down
17 changes: 4 additions & 13 deletions src/Aquila_Resolve/g2p.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from functools import lru_cache

import pywordsegment
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

Expand All @@ -17,27 +16,21 @@
from .processors import Processor
from .infer import Infer
from .symbols import contains_alpha, valid_braces
from .data.remote import ensure_nltk

re_digit = re.compile(r"\((\d+)\)")
re_bracket_with_digit = re.compile(r"\(.*\)")
re_phonemes = re.compile(r'\{.*?}')

# Check that the nltk data is downloaded, if not, download it
try:
nltk.data.find('corpora/wordnet.zip')
nltk.data.find('corpora/omw-1.4.zip')
except LookupError:
nltk.download('wordnet')
nltk.download('omw-1.4')


class G2p:
def __init__(self, device: str = 'cpu'):
# noinspection GrazieInspection
"""
Grapheme to Phoneme conversion
Initialize the G2p converter.
:param device: Pytorch device.
"""
ensure_nltk() # Ensure nltk data is downloaded
self.dict = get_cmudict() # CMU Dictionary
self.h2p = H2p(preload=True) # H2p parser
self.lemmatize = WordNetLemmatizer().lemmatize # WordNet Lemmatizer - used to find singular form
Expand Down Expand Up @@ -65,7 +58,6 @@ def __init__(self, device: str = 'cpu'):

@lru_cache(maxsize=None)
def lookup(self, text: str, pos: str = None) -> str | None:
# noinspection GrazieInspection
"""
Gets the CMU Dictionary entry for a word.
Expand Down Expand Up @@ -134,7 +126,6 @@ def lookup(self, text: str, pos: str = None) -> str | None:
return None

def convert(self, text: str, convert_num: bool = True) -> str | None:
# noinspection GrazieInspection
"""
Replace a grapheme text line with phonemes.
Expand Down
15 changes: 5 additions & 10 deletions src/Aquila_Resolve/h2p.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk import pos_tag_sents
from .data.remote import ensure_nltk
from .dictionary import Dictionary
from .filter import filter_text as ft
from .text.replace import replace_first
from . import format_ph as ph

# Check required nltk data exists, if not, download it
try:
from nltk.data import find
find('taggers/averaged_perceptron_tagger.zip')
except LookupError: # pragma: no cover
from nltk.downloader import download
download('averaged_perceptron_tagger', raise_on_error=True)


class H2p:
def __init__(self, dict_path=None, preload=False, phoneme_format=''):
Expand All @@ -29,6 +22,8 @@ def __init__(self, dict_path=None, preload=False, phoneme_format=''):
:param preload: Preloads the tokenizer and tagger during initialization
:type preload: bool
"""
# Ensure nltk data is available
ensure_nltk()

# Supported phoneme formats
self.phoneme_format = phoneme_format
Expand Down Expand Up @@ -87,9 +82,9 @@ def replace_het_list(self, text_list):
# Get pos tags list
tags_list = pos_tag_sents(list_sentence_words)
# Loop through lines
for index in range(len(tags_list)):
for index, line in enumerate(tags_list):
# Loop through words and pos tags in tags_list index
for word, pos in tags_list[index]:
for word, pos in line:
# Skip if word not in dictionary
if not self.dict.contains(word):
continue
Expand Down
8 changes: 3 additions & 5 deletions src/Aquila_Resolve/infer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations
from .models.dp.phonemizer import Phonemizer
from .data import DATA_PATH
from .data.remote import ensure_download, check_updates
from .data import PT_FILE
from .data.remote import ensure_download
from .models import MODELS_PATH
import sys

Expand All @@ -11,9 +11,7 @@
class Infer:
def __init__(self, device='cpu'):
ensure_download() # Download checkpoint if necessary
check_updates() # Check for checkpoint updates
checkpoint_path = DATA_PATH.joinpath('model.pt')
self.model = Phonemizer.from_checkpoint(checkpoint_path, device=device)
self.model = Phonemizer.from_checkpoint(PT_FILE, device=device)
self.lang = 'en_us'
self.batch_size = 32

Expand Down
21 changes: 3 additions & 18 deletions src/Aquila_Resolve/processors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Transformations of text sequences for matching
from __future__ import annotations
from typing import TYPE_CHECKING
from collections import defaultdict

import re

Expand All @@ -18,25 +19,9 @@ def __init__(self, g2p: G2p):
self._tag = g2p.h2p.tag
self._stem = g2p.stem
# Number of times respective methods were called
self.stat_hits = {
'possessives': 0,
'contractions': 0,
'hyphenated': 0,
'compound': 0,
'plural': 0,
'stem': 0,
'inference': 0
}
self.stat_hits = defaultdict(int)
# Number of times respective methods returned value (not None)
self.stat_resolves = {
'possessives': 0,
'contractions': 0,
'hyphenated': 0,
'compound': 0,
'plural': 0,
'stem': 0,
'inference': 0
}
self.stat_resolves = defaultdict(int)

def auto_possessives(self, word: str) -> str | None:
"""
Expand Down
8 changes: 8 additions & 0 deletions tests/cli/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import pytest
from Aquila_Resolve.cli import cli


def test_main_menu(mocker):
    """main_menu() exits with status 0 once the (stubbed) prompt returns."""
    # Stub the interactive prompt so the test never blocks on user input
    mocker.patch.object(cli, 'g2p_convert', return_value=None)
    with pytest.raises(SystemExit) as exc_info:
        cli.main_menu()
    # Checking only for SystemExit would also pass on a failure exit status
    assert exc_info.value.code == 0

0 comments on commit 2f767d8

Please sign in to comment.