From 02b0ba5af56ae4ebff61bbcc2da33445949cdfdf Mon Sep 17 00:00:00 2001 From: lqb11 Date: Thu, 9 Apr 2020 12:10:25 +0800 Subject: [PATCH] Tokenizer supports string split with multiple spaces. See #69 --- .../com/alibaba/alink/operator/common/nlp/TokenizerMapper.java | 2 +- .../alibaba/alink/operator/common/nlp/TokenizerMapperTest.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/com/alibaba/alink/operator/common/nlp/TokenizerMapper.java b/core/src/main/java/com/alibaba/alink/operator/common/nlp/TokenizerMapper.java index febbdefbc..97b4831f8 100644 --- a/core/src/main/java/com/alibaba/alink/operator/common/nlp/TokenizerMapper.java +++ b/core/src/main/java/com/alibaba/alink/operator/common/nlp/TokenizerMapper.java @@ -11,7 +11,7 @@ * Transform all words into lower case, and split it by white space. */ public class TokenizerMapper extends SISOMapper { - private static final String SPLIT_DELIMITER = "\\s"; + private static final String SPLIT_DELIMITER = "\\s+"; public TokenizerMapper(TableSchema dataSchema, Params params) { super(dataSchema, params); diff --git a/core/src/test/java/com/alibaba/alink/operator/common/nlp/TokenizerMapperTest.java b/core/src/test/java/com/alibaba/alink/operator/common/nlp/TokenizerMapperTest.java index cce3e1f57..44b7754fb 100644 --- a/core/src/test/java/com/alibaba/alink/operator/common/nlp/TokenizerMapperTest.java +++ b/core/src/test/java/com/alibaba/alink/operator/common/nlp/TokenizerMapperTest.java @@ -24,7 +24,7 @@ public void testDefault() throws Exception { TokenizerMapper mapper = new TokenizerMapper(schema, params); assertEquals(mapper.map(Row.of("This\tis a unit test for mapper")).getField(0), - "this is a unit test for mapper"); + "this is a unit test for mapper"); assertEquals(mapper.getOutputSchema(), schema); }