Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Fixed empty reader panic for NDJSON type infer #974

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/io/json/read/infer_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,18 @@ fn infer_number(n: &serde_json::Number) -> DataType {
}

/// Coerce an heterogeneous set of [`DataType`] into a single one. Rules:
/// * The empty set is coerced to `Null`
/// * `Int64` and `Float64` are `Float64`
/// * Lists and scalars are coerced to a list of a compatible scalar
/// * Structs contain the union of all fields
/// * All other types are coerced to `Utf8`
pub(crate) fn coerce_data_type<A: Borrow<DataType>>(datatypes: &[A]) -> DataType {
use DataType::*;

if datatypes.is_empty() {
return DataType::Null;
}

let are_all_equal = datatypes.windows(2).all(|w| w[0].borrow() == w[1].borrow());

if are_all_equal {
Expand Down Expand Up @@ -186,4 +191,15 @@ mod test {
List(Box::new(Field::new(ITEM_NAME, Utf8, true))),
);
}

#[test]
fn test_coersion_of_nulls() {
assert_eq!(coerce_data_type(&[DataType::Null]), DataType::Null);
assert_eq!(
coerce_data_type(&[DataType::Null, DataType::Boolean]),
DataType::Utf8
);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this intended? Should it not be just DataType::Boolean?

Similarly, {"a": null}" is currently infered as DataType::Struct(vec![])and not DataType::Struct(vec![Field::new("a", DataType::Null, true)])

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good observation. I think it is intended - infer_array does not pass DataType::Null to coerce_data_type since arrow arrays are nullable and thus we handle Null as part before passing to coerce_data_type. The idea is here.

With that said, imo it is not the cleanest design and maybe we should clean it up?

let vec: Vec<DataType> = vec![];
assert_eq!(coerce_data_type(vec.as_slice()), DataType::Null);
}
}
7 changes: 7 additions & 0 deletions src/io/ndjson/read/deserialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ use super::super::super::json::read::_deserialize;
/// # Errors
/// This function errors iff any of the rows is not a valid JSON (i.e. the format is not valid NDJSON).
pub fn deserialize(rows: &[String], data_type: DataType) -> Result<Arc<dyn Array>, ArrowError> {
if rows.is_empty() {
return Err(ArrowError::ExternalFormat(
"Cannot deserialize 0 NDJSON rows because empty string is not a valid JSON value"
.to_string(),
));
}

// deserialize strings to `Value`s
let rows = rows
.iter()
Expand Down
8 changes: 7 additions & 1 deletion src/io/ndjson/read/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,19 @@ impl<R: BufRead> FallibleStreamingIterator for FileReader<R> {

/// Infers the [`DataType`] from an NDJSON file, optionally only using `number_of_rows` rows.
///
/// # Implementantion
/// # Implementation
/// This implementation reads the file line by line and infers the type of each line.
/// It performs both `O(N)` IO and CPU-bounded operations where `N` is the number of rows.
pub fn infer<R: std::io::BufRead>(
reader: &mut R,
number_of_rows: Option<usize>,
) -> Result<DataType> {
if reader.fill_buf().map(|b| b.is_empty())? {
return Err(ArrowError::ExternalFormat(
"Cannot infer NDJSON types on empty reader because empty string is not a valid JSON value".to_string(),
));
}

let rows = vec!["".to_string(); 1]; // 1 <=> read row by row
let mut reader = FileReader::new(reader, rows, number_of_rows);

Expand Down
31 changes: 30 additions & 1 deletion tests/it/io/ndjson/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::sync::Arc;

use arrow2::array::*;
use arrow2::datatypes::{DataType, Field};
use arrow2::error::Result;
use arrow2::error::{ArrowError, Result};
use arrow2::io::ndjson::read as ndjson_read;
use arrow2::io::ndjson::read::FallibleStreamingIterator;

Expand Down Expand Up @@ -57,6 +57,35 @@ fn infer_nullable() -> Result<()> {
Ok(())
}

#[test]
fn read_null() -> Result<()> {
let ndjson = "null";
let expected_data_type = DataType::Null;

let data_type = infer(ndjson)?;
assert_eq!(expected_data_type, data_type);

let arrays = read_and_deserialize(ndjson, &data_type, 1000)?;
let expected = NullArray::new(data_type, 1);
assert_eq!(expected, arrays[0].as_ref());
Ok(())
}

#[test]
fn read_empty_reader() -> Result<()> {
let ndjson = "";

let infer_error = infer(ndjson);
assert!(matches!(infer_error, Err(ArrowError::ExternalFormat(_))));

let deserialize_error = ndjson_read::deserialize(&[], DataType::Null);
assert!(matches!(
deserialize_error,
Err(ArrowError::ExternalFormat(_))
));
Ok(())
}

fn case_nested_struct() -> (String, Arc<dyn Array>) {
let ndjson = r#"{"a": {"a": 2.0, "b": 2}}
{"a": {"b": 2}}
Expand Down