-
Notifications
You must be signed in to change notification settings - Fork 154
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve naming: JSON shards are actually JSONL, etc. (#537)
* Standardize docstrings, also fix ordering of get_sample_data, decode_sample. * Terminology: "joint" -> "mono". * "split" -> "dual" to stop confusing people (SplitWriter != dataset splits) * "Reader" -> "Shard". They manage shards. They do more than read. * Fix filenames accordingly. * Finally, JSON -> JSONL. * Switch order of decorators... * Fix markdown code.
- Loading branch information
Showing
22 changed files
with
301 additions
and
263 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,54 @@ | ||
# Copyright 2022-2024 MosaicML Streaming authors | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
"""Individual dataset writer for every format.""" | ||
"""Streaming serialization format, consisting of an index and multiple types of shards.""" | ||
|
||
from typing import Any, Dict, Optional | ||
|
||
from streaming.format.index import get_index_basename | ||
from streaming.format.json import JSONReader, JSONWriter | ||
from streaming.format.mds import MDSReader, MDSWriter | ||
from streaming.format.reader import FileInfo, Reader | ||
from streaming.format.xsv import CSVReader, CSVWriter, TSVReader, TSVWriter, XSVReader, XSVWriter | ||
from streaming.format.jsonl import JSONLShard, JSONLWriter | ||
from streaming.format.mds import MDSShard, MDSWriter | ||
from streaming.format.shard import FileInfo, Shard | ||
from streaming.format.xsv import CSVShard, CSVWriter, TSVShard, TSVWriter, XSVShard, XSVWriter | ||
|
||
__all__ = [ | ||
'CSVWriter', 'FileInfo', 'get_index_basename', 'JSONWriter', 'MDSWriter', 'Reader', | ||
'reader_from_json', 'TSVWriter', 'XSVWriter' | ||
'CSVWriter', 'FileInfo', 'get_index_basename', 'JSONLWriter', 'MDSWriter', 'Shard', | ||
'shard_from_json', 'TSVWriter', 'XSVWriter' | ||
] | ||
|
||
_readers = { | ||
'csv': CSVReader, | ||
'json': JSONReader, | ||
'mds': MDSReader, | ||
'tsv': TSVReader, | ||
'xsv': XSVReader | ||
# Mapping of shard metadata dict "format" field to what type of Shard it is. | ||
_shards = { | ||
'csv': CSVShard, | ||
'jsonl': JSONLShard, | ||
'mds': MDSShard, | ||
'tsv': TSVShard, | ||
'xsv': XSVShard, | ||
} | ||
|
||
|
||
def reader_from_json(dirname: str, split: Optional[str], obj: Dict[str, Any]) -> Reader: | ||
"""Initialize the reader from JSON object. | ||
def _get_shard_class(format_name: str) -> Shard: | ||
"""Get the associated Shard class given a Shard format name. | ||
Args: | ||
format_name (str): Shard format name. | ||
""" | ||
# JSONL shards were originally called JSON shards (while containing JSONL). | ||
if format_name == 'json': | ||
format_name = 'jsonl' | ||
return _shards[format_name] | ||
|
||
|
||
def shard_from_json(dirname: str, split: Optional[str], obj: Dict[str, Any]) -> Shard: | ||
"""Create a shard from a JSON config. | ||
Args: | ||
dirname (str): Local directory containing shards. | ||
split (str, optional): Which dataset split to use, if any. | ||
obj (Dict[str, Any]): JSON object to load. | ||
Returns: | ||
Reader: Loaded Reader of `format` type | ||
Shard: The loaded Shard. | ||
""" | ||
assert obj['version'] == 2 | ||
cls = _readers[obj['format']] | ||
cls = _get_shard_class(obj['format']) | ||
return cls.from_json(dirname, split, obj) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Copyright 2023 MosaicML Streaming authors | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
"""Streaming JSONL shards.""" | ||
|
||
from streaming.format.jsonl.shard import JSONLShard | ||
from streaming.format.jsonl.writer import JSONLWriter | ||
|
||
__all__ = ['JSONLShard', 'JSONLWriter'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,9 @@ | ||
# Copyright 2022-2024 MosaicML Streaming authors | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
"""Module to write and read the dataset in MDS format.""" | ||
"""MDS shards.""" | ||
|
||
from streaming.format.mds.reader import MDSReader | ||
from streaming.format.mds.shard import MDSShard | ||
from streaming.format.mds.writer import MDSWriter | ||
|
||
__all__ = ['MDSReader', 'MDSWriter'] | ||
__all__ = ['MDSShard', 'MDSWriter'] |
Oops, something went wrong.