fix inconsistency re upgrading to transformers 4.0.0 (#124)
* update transformers==2.10.0 to transformers==4.0.0rc1

* use_fast=False in tokenizer

* update requirements.txt and environment.yml

* add sentencepiece dependency

* change T5 padding to 'longest'
MXueguang authored Dec 2, 2020
1 parent aed21c0 commit 9a1fe70
Showing 8 changed files with 13 additions and 9 deletions.
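
The crux of the upgrade: in transformers 4.0.0, AutoTokenizer returns a fast (Rust) tokenizer by default, and sentencepiece is no longer installed as a core dependency, although the slow T5 tokenizer still requires it. A minimal sketch of the pattern this commit standardizes on, assuming transformers==4.0.0 and sentencepiece==0.1.94 are installed:

from transformers import AutoTokenizer

# use_fast=False keeps the slow (Python/SentencePiece-backed) tokenizer
# that pygaggle's batch tokenizers were written against, instead of the
# fast tokenizer that 4.0.0 now returns by default.
tokenizer = AutoTokenizer.from_pretrained('t5-base', use_fast=False)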
1 change: 1 addition & 0 deletions environment.yml
@@ -16,3 +16,4 @@ dependencies:
   - tokenizers==0.9.4
   - tqdm==4.45.0
   - transformers==4.0.0
+  - sentencepiece==0.1.94
2 changes: 1 addition & 1 deletion pygaggle/model/tokenize.py
@@ -110,7 +110,7 @@ class T5BatchTokenizer(AppendEosTokenizerMixin, QueryDocumentBatchTokenizer):
     def __init__(self, *args, **kwargs):
         kwargs['pattern'] = 'Query: {query} Document: {document} Relevant:'
         kwargs['return_attention_mask'] = True
-        kwargs['padding'] = 'max_length'
+        kwargs['padding'] = 'longest'
         kwargs["truncation"] = True
         kwargs['return_tensors'] = 'pt'
         kwargs['max_length'] = 512
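
Why 'longest' rather than 'max_length': with padding='max_length', every batch is padded to the full 512 tokens regardless of content; with padding='longest', a batch is padded only to its longest member, which avoids wasted computation on short inputs. An illustrative sketch, not taken from the repo, assuming the same transformers 4.0.0 setup as above:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('t5-base', use_fast=False)
batch = tok(['a short query', 'a noticeably longer query about neural reranking'],
            padding='longest', truncation=True, max_length=512,
            return_tensors='pt')
# The second dimension equals the longest sequence in the batch (capped
# at 512), rather than always 512 as under padding='max_length'.
print(batch['input_ids'].shape)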
4 changes: 2 additions & 2 deletions pygaggle/rerank/transformer.py
@@ -43,7 +43,7 @@ def get_model(pretrained_model_name_or_path: str = 'castorini/monot5-base-msmarc
     def get_tokenizer(pretrained_model_name_or_path: str = 't5-base',
                       *args, batch_size: int = 8, **kwargs) -> T5BatchTokenizer:
         return T5BatchTokenizer(
-            AutoTokenizer.from_pretrained(pretrained_model_name_or_path, *args, **kwargs),
+            AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=False, *args, **kwargs),
             batch_size=batch_size
         )
@@ -132,7 +132,7 @@ def get_model(pretrained_model_name_or_path: str = 'castorini/monobert-large-msm
     @staticmethod
     def get_tokenizer(pretrained_model_name_or_path: str = 'bert-large-uncased',
                       *args, **kwargs) -> AutoTokenizer:
-        return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=False, *args, **kwargs)

     @torch.no_grad()
     def rerank(self, query: Query, texts: List[Text]) -> List[Text]:
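
For context, the patched get_tokenizer above is equivalent to wrapping a slow tokenizer by hand. A hedged sketch assembled only from signatures visible in this diff, assuming T5BatchTokenizer is importable from pygaggle.model.tokenize (the module patched above):

from transformers import AutoTokenizer
from pygaggle.model.tokenize import T5BatchTokenizer

# Mirrors get_tokenizer(): wrap the slow T5 tokenizer in the batch
# tokenizer, which supplies the 'Query: ... Document: ... Relevant:'
# pattern plus the padding/truncation kwargs shown in tokenize.py.
tokenizer = T5BatchTokenizer(
    AutoTokenizer.from_pretrained('t5-base', use_fast=False),
    batch_size=8)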
2 changes: 1 addition & 1 deletion pygaggle/run/evaluate_document_ranker.py
@@ -93,7 +93,7 @@ def construct_transformer(options:
     model = AutoModel.from_pretrained(options.model,
                                       from_tf=options.from_tf).to(device).eval()
     tokenizer = SimpleBatchTokenizer(AutoTokenizer.from_pretrained(
-        options.tokenizer_name),
+        options.tokenizer_name, use_fast=False),
         options.batch_size)
     provider = CosineSimilarityMatrixProvider()
     return UnsupervisedTransformerReranker(model, tokenizer, provider)
4 changes: 3 additions & 1 deletion pygaggle/run/evaluate_kaggle_highlighter.py
@@ -139,7 +139,9 @@ def construct_qa_transformer(options: KaggleEvaluationOptions) -> Reranker:
     device = torch.device(options.device)
     model = model.to(device).eval()
     tokenizer = AutoTokenizer.from_pretrained(
-        options.tokenizer_name, do_lower_case=options.do_lower_case)
+        options.tokenizer_name,
+        do_lower_case=options.do_lower_case,
+        use_fast=False)
     return QuestionAnsweringTransformerReranker(model, tokenizer)

2 changes: 1 addition & 1 deletion pygaggle/run/evaluate_passage_ranker.py
@@ -91,7 +91,7 @@ def construct_transformer(options:
     model = AutoModel.from_pretrained(options.model,
                                       from_tf=options.from_tf).to(device).eval()
     tokenizer = SimpleBatchTokenizer(AutoTokenizer.from_pretrained(
-        options.tokenizer_name),
+        options.tokenizer_name, use_fast=False),
         options.batch_size)
     provider = CosineSimilarityMatrixProvider()
     return UnsupervisedTransformerReranker(model, tokenizer, provider)
6 changes: 3 additions & 3 deletions pygaggle/run/evaluate_trec_covid_ranker.py
@@ -80,7 +80,7 @@ def construct_t5(options: DocumentRankingEvaluationOptions) -> Reranker:
     device = torch.device(options.device)
     model = T5ForConditionalGeneration.from_pretrained(options.model,
                                                        from_tf=options.from_tf).to(device).eval()
-    tokenizer = AutoTokenizer.from_pretrained(options.model_type)
+    tokenizer = AutoTokenizer.from_pretrained(options.model_type, use_fast=False)
     tokenizer = T5BatchTokenizer(tokenizer, options.batch_size)
     return T5Reranker(model, tokenizer)

@@ -91,7 +91,7 @@ def construct_transformer(options:
     model = AutoModel.from_pretrained(options.model,
                                       from_tf=options.from_tf).to(device).eval()
     tokenizer = SimpleBatchTokenizer(AutoTokenizer.from_pretrained(
-        options.tokenizer_name),
+        options.tokenizer_name, use_fast=False),
         options.batch_size)
     provider = CosineSimilarityMatrixProvider()
     return UnsupervisedTransformerReranker(model, tokenizer, provider)

@@ -102,7 +102,7 @@ def construct_seq_class_transformer(options: DocumentRankingEvaluationOptions
     model = AutoModelForSequenceClassification.from_pretrained(options.model, from_tf=options.from_tf)
     device = torch.device(options.device)
     model = model.to(device).eval()
-    tokenizer = AutoTokenizer.from_pretrained(options.tokenizer_name)
+    tokenizer = AutoTokenizer.from_pretrained(options.tokenizer_name, use_fast=False)
     return SequenceClassificationTransformerReranker(model, tokenizer)
1 change: 1 addition & 0 deletions requirements.txt
@@ -11,3 +11,4 @@ tensorflow>=2.2.0rc1
 tokenizers==0.9.4
 tqdm==4.45.0
 transformers==4.0.0
+sentencepiece==0.1.94
