Skip to content

Commit

Permalink
Tokenizer supports string split with multiple spaces.
Browse files: browse the repository at this point in the history
See #69
  • Loading branch information
lqb11 authored and shaomeng.wang committed Apr 9, 2020
1 parent 0f9227f commit 02b0ba5
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* Transform all words to lower case, and split the text on whitespace.
*/
public class TokenizerMapper extends SISOMapper {
private static final String SPLIT_DELIMITER = "\\s";
private static final String SPLIT_DELIMITER = "\\s+";

public TokenizerMapper(TableSchema dataSchema, Params params) {
super(dataSchema, params);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public void testDefault() throws Exception {
TokenizerMapper mapper = new TokenizerMapper(schema, params);

assertEquals(mapper.map(Row.of("This\tis a unit test for mapper")).getField(0),
"this is a unit test for mapper");
"this is a unit test for mapper");
assertEquals(mapper.getOutputSchema(), schema);
}

Expand Down

0 comments on commit 02b0ba5

Please sign in to comment.