[Fix] Shift to use Tokenizers instead of AutoTikTokenizer as the default
bhavnicksm committed Feb 4, 2025
1 parent 4b8f705 commit dc9fe48
Showing 4 changed files with 28 additions and 33 deletions.
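In practice, the change swaps the default token-counting backend: instead of resolving a model name through AutoTikTokenizer onto a tiktoken encoding, Chonkie now loads a Hugging Face `tokenizers` tokenizer directly. A minimal before/after sketch (illustrative only, using the `gpt2` identifier that the test suite below also uses):

    # Before this commit: AutoTikTokenizer mapped a Hugging Face model name to a tiktoken encoding.
    # from autotiktokenizer import AutoTikTokenizer
    # encoding = AutoTikTokenizer.from_pretrained("gpt2")
    # n_tokens = len(encoding.encode("Chonkie chunks text."))

    # After this commit: the `tokenizers` library is the default backend.
    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_pretrained("gpt2")  # fetches the gpt2 tokenizer from the Hugging Face Hub
    n_tokens = len(tokenizer.encode("Chonkie chunks text.").ids)  # Encoding object exposes token ids on .ids
    print(n_tokens)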
4 changes: 2 additions & 2 deletions README.md
@@ -7,7 +7,7 @@
[![PyPI version](https://img.shields.io/pypi/v/chonkie.svg)](https://pypi.org/project/chonkie/)
[![License](https://img.shields.io/github/license/bhavnicksm/chonkie.svg)](/~https://github.com/bhavnicksm/chonkie/blob/main/LICENSE)
[![Documentation](https://img.shields.io/badge/docs-chonkie.ai-blue.svg)](https://docs.chonkie.ai)
-![Package size](https://img.shields.io/badge/size-11.2MB-blue)
+![Package size](https://img.shields.io/badge/size-15MB-blue)
[![Downloads](https://static.pepy.tech/badge/chonkie)](https://pepy.tech/project/chonkie)
[![Discord](https://dcbadge.limes.pink/api/server/https://discord.gg/rYYp6DC4cv?style=flat)](https://discord.gg/rYYp6DC4cv)
[![GitHub stars](https://img.shields.io/github/stars/bhavnicksm/chonkie.svg)](/~https://github.com/bhavnicksm/chonkie/stargazers)
@@ -110,7 +110,7 @@ Here's a quick peek at how Chonkie performs:

**Size**📦

-- **Default Install:** 11.2MB (vs 80-171MB for alternatives)
+- **Default Install:** 15MB (vs 80-171MB for alternatives)
- **With Semantic:** Still lighter than the competition!

**Speed**
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -31,7 +31,7 @@ classifiers = [
"Topic :: Text Processing :: Linguistic"
]
dependencies = [
"autotiktokenizer>=0.2.0",
"tokenizers>=0.16.0",
"tqdm>=4.64.0"
]

@@ -46,6 +46,7 @@ openai = ["openai>=1.0.0", "numpy>=1.23.0, <2.2"]
semantic = ["model2vec>=0.3.0", "numpy>=1.23.0, <2.2"]
all = ["sentence-transformers>=3.0.0", "numpy>=1.23.0, <2.2", "openai>=1.0.0", "model2vec>=0.3.0"]
dev = [
"tiktoken>=1.40.0",
"pytest>=6.2.0",
"pytest-cov>=4.0.0",
"datasets>=1.14.0",
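With this change, `tokenizers` becomes the only tokenizer dependency of the default install, while `tiktoken` is only pulled in through the dev extra. A hypothetical post-install check (not part of the repository) that confirms the new dependency layout, using only the standard library:

    from importlib.metadata import PackageNotFoundError, version

    try:
        print("tokenizers", version("tokenizers"))  # required dependency after this commit
    except PackageNotFoundError:
        print("tokenizers missing: the default install should always provide it")

    try:
        print("tiktoken", version("tiktoken"))  # only expected when the dev extra is installed
    except PackageNotFoundError:
        print("tiktoken not installed (expected for a plain default install)")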
46 changes: 20 additions & 26 deletions src/chonkie/chunker/base.py
@@ -62,47 +62,41 @@ def _get_tokenizer_backend(self):
     def _load_tokenizer(self, tokenizer_name: str):
         """Load a tokenizer based on the backend."""
         try:
-            if importlib.util.find_spec("tiktoken") is not None:
-                from tiktoken import get_encoding
+            if importlib.util.find_spec("tokenizers") is not None:
+                from tokenizers import Tokenizer

-                self._tokenizer_backend = "tiktoken"
-                return get_encoding(tokenizer_name)
+                self._tokenizer_backend = "tokenizers"
+                return Tokenizer.from_pretrained(tokenizer_name)
             else:
-                raise Warning("TikToken library not found. Trying autotiktokenizer.")
+                raise Warning("Tokenizers library not found. Trying tiktoken.")
         except Exception:
             try:
-                if importlib.util.find_spec("autotiktokenizer") is not None:
-                    from autotiktokenizer import AutoTikTokenizer
+                if importlib.util.find_spec("tiktoken") is not None:
+                    from tiktoken import get_encoding

                     self._tokenizer_backend = "tiktoken"
-                    return AutoTikTokenizer.from_pretrained(tokenizer_name)
+                    return get_encoding(tokenizer_name)
                 else:
                     raise Warning(
-                        "AutoTikTokenizer library not found. Trying tokenizers."
+                        "TikToken library not found. Trying transformers."
                     )
             except Exception:
                 try:
-                    if importlib.util.find_spec("tokenizers") is not None:
-                        from tokenizers import Tokenizer
+                    if importlib.util.find_spec("transformers") is not None:
+                        from transformers import AutoTokenizer

-                        self._tokenizer_backend = "tokenizers"
-                        return Tokenizer.from_pretrained(tokenizer_name)
+                        self._tokenizer_backend = "transformers"
+                        return AutoTokenizer.from_pretrained(tokenizer_name)
                     else:
-                        raise Warning(
-                            "Tokenizers library not found. Trying transformers."
-                        )
-                except Exception:
-                    try:
-                        if importlib.util.find_spec("transformers") is not None:
-                            from transformers import AutoTokenizer
-
-                            self._tokenizer_backend = "transformers"
-                            return AutoTokenizer.from_pretrained(tokenizer_name)
-                    except Exception:
                         raise ValueError(
-                            "Tokenizer not found in the following libraries: transformers, tokenizers, autotiktokenizer, tiktoken",
-                            "Please install one of these libraries to use the chunker.",
+                            "Tokenizer not found in the following libraries: transformers, tokenizers, tiktoken",
+                            "Please check your installations, or use a different tokenizer.",
                         )
+                except Exception:
+                    raise ValueError(
+                        "Tokenizer not found in the following libraries: transformers, tokenizers, tiktoken",
+                        "Please check your installations, or use a different tokenizer.",
+                    )

     def _get_tokenizer_counter(self) -> Callable[[str], int]:
         """Get token counter based on tokenizer backend."""
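The resulting lookup order is tokenizers, then tiktoken, then transformers, with the chosen backend recorded in self._tokenizer_backend so token counting can dispatch on it. The hunk ends just before _get_tokenizer_counter; the following is only a sketch of how such a dispatch commonly looks (not necessarily Chonkie's exact implementation), and it shows why the backend flag matters: each backend returns token ids in a slightly different shape.

    # Illustrative sketch; the real _get_tokenizer_counter in base.py may differ.
    def make_token_counter(tokenizer, backend: str):
        """Return a callable that counts tokens for the given backend."""
        if backend == "tokenizers":
            # tokenizers.Tokenizer.encode returns an Encoding; the ids live on .ids
            return lambda text: len(tokenizer.encode(text).ids)
        elif backend in ("tiktoken", "transformers"):
            # tiktoken encodings and transformers tokenizers return a plain list of ids
            return lambda text: len(tokenizer.encode(text))
        raise ValueError(f"Unsupported tokenizer backend: {backend}")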
8 changes: 4 additions & 4 deletions tests/chunker/test_sentence_chunker.py
@@ -2,16 +2,16 @@
 from typing import List

 import pytest
-from autotiktokenizer import AutoTikTokenizer

-from chonkie.chunker.base import Chunk
-from chonkie.chunker.sentence import SentenceChunker
+from chonkie import Chunk
+from chonkie import SentenceChunker

+from tokenizers import Tokenizer

 @pytest.fixture
 def tokenizer():
     """Return a tokenizer instance."""
-    return AutoTikTokenizer.from_pretrained("gpt2")
+    return Tokenizer.from_pretrained("gpt2")


 @pytest.fixture
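The fixture now hands the rest of the test module a plain tokenizers.Tokenizer instead of a tiktoken encoding, so encoding, counting, and decoding all go through the tokenizers API. A small sketch of how the fixture object behaves (assuming network access to fetch the gpt2 tokenizer from the Hugging Face Hub):

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_pretrained("gpt2")  # same object the fixture returns
    encoding = tokenizer.encode("If you like Chonkie, leave a star!")
    print(len(encoding.ids))               # token count, which the chunkers use for sizing
    print(tokenizer.decode(encoding.ids))  # round-trips back to the original text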
