-
Following the example at https://github.com/awslabs/aws-sdk-rust/blob/main/examples/textract/src/bin/analyze-document.rs#L52, I am trying to write the complete response from AWS Textract for a form-type document to S3. I am doing something like the following. let textract_result = textract_client
.analyze_document()
.document(document)
.feature_types(FeatureType::Forms)
.send()
.await?; let textract_result_output = textract_result
.blocks()
.unwrap_or_default()
.iter()
.filter_map(|block| block.text().map(ToOwned::to_owned))
.next()
.expect("found query result");
// Serialize the extracted text to a JSON string.
// NOTE(review): `textract_result_output` holds only the first block text
// (the chain above ends in `.next()`), so this is not the full response.
let json_string = serde_json::to_string(&textract_result_output)?;
tracing::info!("json_string {:?}", json_string);
// Create a ByteStream from the UTF-8 encoded bytes of the JSON string
let textract_result_bytes = ByteStream::from(json_string.as_bytes().to_vec());
// Upload the bytes to S3 as the body of the object at `output_bucket`/`output_key`
let _put_object_result = s3_client
.put_object()
.bucket(output_bucket)
.key(output_key)
.body(textract_result_bytes)
.send()
.await?; This code writes only the very first text block of the document to the file on S3. I am trying to write the complete response to a text file and save it on S3. Any suggestions in this regard would be awesome. Also, if there is any documentation that describes how to parse this response in Rust in the most efficient way, that would be awesome. Thanks |
Beta Was this translation helpful? Give feedback.
Replies: 3 comments 2 replies
-
The output obtained via the AWS CLI is in this gist: https://gist.github.com/harssh/84e7efb43ed196a84f2ebab775fe7b19 |
Beta Was this translation helpful? Give feedback.
-
When you call let res: String = textract_result
.blocks()
.unwrap_or_default()
.iter()
.filter_map(|block| block.text().map(ToOwned::to_owned))
.collect();
// `res` is the text of every block concatenated with no separator; it is
// plain text, not JSON, despite the "json_string" log label.
tracing::info!("json_string {res:?}");
// Create a ByteStream from the UTF-8 encoded bytes of the concatenated text
let textract_result_bytes = ByteStream::from(res.as_bytes().to_vec());
// Upload the bytes to S3 as the body of the object at `output_bucket`/`output_key`
let _put_object_result = s3_client
.put_object()
.bucket(output_bucket)
.key(output_key)
.body(textract_result_bytes)
.send()
.await?; Using let res: Vec<String> = textract_result
.blocks()
.unwrap_or_default()
.iter()
.filter_map(|block| block.text().map(ToOwned::to_owned))
.collect();
// Join the per-block texts with commas. `join` on a slice of strings takes a
// `&str` separator; a `char` separator (',') does not compile.
let res = res.join(",");
tracing::info!("json_string {res:?}");
// Create a ByteStream from the UTF-8 encoded bytes of the comma-joined text
let textract_result_bytes = ByteStream::from(res.as_bytes().to_vec());
// Upload the bytes to S3 as the body of the object at `output_bucket`/`output_key`
let _put_object_result = s3_client
.put_object()
.bucket(output_bucket)
.key(output_key)
.body(textract_result_bytes)
.send()
.await?; However, the SDKs don't currently support |
Beta Was this translation helpful? Give feedback.
-
Following https://ectobit.com/blog/parsing-json-in-rust/ I created the structs below. If I could iterate through the response, I think I could get some proper JSON output. pub type TexResponse = Vec<TexAnalyzeDocumentOutput>;
/// Alias for a list of [`Block`] values.
pub type BlockResponse = Vec<Block>;
/// Serde model for the JSON form of a Textract `analyze_document` response.
///
/// All keys are PascalCase (`DocumentMetadata`, `Blocks`,
/// `AnalyzeDocumentModelVersion`), so the container-level `rename_all`
/// yields the wire names directly; no per-field renames are needed, and a
/// field added later picks up the correct casing automatically.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "PascalCase")]
pub struct TexAnalyzeDocumentOutput {
    /// Document-level metadata; see [`DocumentMetadata`].
    pub document_metadata: DocumentMetadata,
    /// The blocks contained in the response; see [`Block`].
    pub blocks: Vec<Block>,
    /// Value of the `AnalyzeDocumentModelVersion` key.
    pub analyze_document_model_version: String,
}
/// Serde model for the `DocumentMetadata` object of the response.
///
/// Keys are PascalCase on the wire, handled by the container-level
/// `rename_all` rather than per-field renames.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "PascalCase")]
pub struct DocumentMetadata {
    /// Value of the `Pages` key.
    pub pages: i64,
}
/// Serde model for one entry of the response's `Blocks` array.
///
/// Keys are PascalCase on the wire (`BlockType`, `Geometry`, `Id`, ...),
/// handled by the container-level `rename_all`. Optional fields deserialize
/// to `None` when the key is absent.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "PascalCase")]
pub struct Block {
    /// Value of the `BlockType` key, kept as a plain string.
    pub block_type: String,
    /// Location of the block, when present; see [`Geometry`].
    pub geometry: Option<Geometry>,
    /// Value of the `Id` key.
    pub id: String,
    /// Links to other blocks; an absent key deserializes to an empty list.
    #[serde(default)]
    pub relationships: Vec<Relationship>,
    /// Value of the `Confidence` key, when present.
    pub confidence: Option<f64>,
    /// Value of the `Text` key, when present.
    pub text: Option<String>,
    /// Value of the `TextType` key, when present.
    pub text_type: Option<String>,
    /// Value of the `Query` key, when present; see [`TextractQuery`].
    pub query: Option<TextractQuery>,
}
/// Serde model for a block's `Geometry` object.
///
/// Keys are PascalCase on the wire (`BoundingBox`, `Polygon`), handled by
/// the container-level `rename_all`.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "PascalCase")]
pub struct Geometry {
    /// Value of the `BoundingBox` key; see [`BoundingBox`].
    pub bounding_box: BoundingBox,
    /// Entries of the `Polygon` array; each entry is one `X`/`Y` pair.
    pub polygon: Vec<Polygon>,
}
/// Serde model for a `BoundingBox` object.
///
/// Keys are PascalCase on the wire (`Width`, `Height`, `Left`, `Top`),
/// handled by the container-level `rename_all`.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "PascalCase")]
pub struct BoundingBox {
    /// Value of the `Width` key.
    pub width: f64,
    /// Value of the `Height` key.
    pub height: f64,
    /// Value of the `Left` key.
    pub left: f64,
    /// Value of the `Top` key.
    pub top: f64,
}
/// Serde model for one entry of a `Polygon` array: a single `X`/`Y` pair.
///
/// The single-letter fields map to the uppercase keys `X` and `Y` through
/// the container-level `rename_all`.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "PascalCase")]
pub struct Polygon {
    /// Value of the `X` key.
    pub x: f64,
    /// Value of the `Y` key.
    pub y: f64,
}
/// Serde model for one entry of a block's `Relationships` array.
///
/// Keys are PascalCase on the wire, handled by the container-level
/// `rename_all`, except where noted on the field.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "PascalCase")]
pub struct Relationship {
    /// Value of the `Type` key. `type` is a Rust keyword, so the field is
    /// named `type_field` and needs an explicit rename (PascalCase alone
    /// would produce `TypeField`).
    #[serde(rename = "Type")]
    pub type_field: String,
    /// Entries of the `Ids` array.
    pub ids: Vec<String>,
}
/// Serde model for a block's `Query` object.
///
/// Keys are PascalCase on the wire (`Text`, `Alias`), handled by the
/// container-level `rename_all`.
#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "PascalCase")]
pub struct TextractQuery {
    /// Value of the `Text` key.
    pub text: String,
    /// Value of the `Alias` key. An absent key deserializes to an empty
    /// string instead of failing the whole document.
    /// NOTE(review): assumes `Alias` is optional in Textract queries; confirm
    /// against the API reference.
    #[serde(default)]
    pub alias: String,
} |
Beta Was this translation helpful? Give feedback.
Following https://ectobit.com/blog/parsing-json-in-rust/ I created the structs below.
If I could iterate through the response, I think I could get some proper JSON output.