diff --git a/.gitignore b/.gitignore index 89a455a889..f182c5376f 100644 --- a/.gitignore +++ b/.gitignore @@ -69,3 +69,4 @@ grobid-home/models/dictionaries* grobid-home/models/software* grobid-home/models/superconductors* grobid-home/models/values +grobid-home/models/dataseer \ No newline at end of file diff --git a/doc/Consolidation.md b/doc/Consolidation.md index e7b6440714..c345169bff 100644 --- a/doc/Consolidation.md +++ b/doc/Consolidation.md @@ -4,7 +4,7 @@ In GROBID, we call __consolidation__ the usage of an external bibliographical se Consolidation has two main interests: -* The consolidation service improves very significantly the retrieval of header information (+.12 to .13 in f-score, e.g. from 74.59 f-score in average for all fields with Ratcliff/Obershelp similarity at 0.95, to 86.62 f-score, using biblio-glutton and GROBID version 0.5.5 for the PMC 1942 dataset, see the [benchmarking documentation](https://grobid.readthedocs.io/en/latest/End-to-end-evaluation/) and [reports](/~https://github.com/kermitt2/grobid/tree/master/grobid-trainer/doc)). +* The consolidation service improves very significantly the retrieval of header information (+.12 to .13 in f-score, e.g. from 74.59 f-score in average for all fields with Ratcliff/Obershelp similarity at 0.95, to 88.89 f-score, using biblio-glutton and GROBID version 0.5.6-SNAPSHOT for the PMC 1942 dataset, see the [benchmarking documentation](https://grobid.readthedocs.io/en/latest/End-to-end-evaluation/) and [reports](/~https://github.com/kermitt2/grobid/tree/master/grobid-trainer/doc)). * The consolidation service matches the extracted bibliographical references with known publications, and complement the parsed bibliographical references with various metadata, in particular DOI, making possible the creation of a citation graph and to link the extracted references to external services. 
diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index a846ae4d6a..d5f197ad47 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -988,7 +988,7 @@ private StringBuilder toTEINote(String noteType, StringBuilder tei, Document doc, GrobidAnalysisConfig config) throws Exception { - List allNotes = new ArrayList(); + List allNotes = new ArrayList<>(); for (DocumentPiece docPiece : documentNoteParts) { List noteTokens = doc.getDocumentPieceTokenization(docPiece); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 562b99f9d0..690f8047c7 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -6,6 +6,8 @@ import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.io.FileUtils; +import java.nio.charset.StandardCharsets; + import org.grobid.core.GrobidModels; import org.grobid.core.data.BibDataSet; import org.grobid.core.data.BiblioItem; @@ -136,22 +138,51 @@ public Document processing(DocumentSource documentSource, BiblioItem resHeader = new BiblioItem(); Pair featSeg = null; if (GrobidProperties.isHeaderUseHeuristics()) { + // heuristics for identifying the header zone, this is the old version of the header block identification, + // still used because more robust than the pure machine learning approach (lack of training data) parsers.getHeaderParser().processingHeaderBlock(config.getConsolidateHeader(), doc, resHeader); } - // above the old version of the header block identification, because more robust - if ((resHeader.getTitle() == null) || (resHeader.getTitle().trim().length() == 0) || - (resHeader.getAuthors() == null) || 
(resHeader.getFullAuthors() == null) || - (resHeader.getFullAuthors().size() == 0) ) { + + if (isBlank(resHeader.getTitle()) || isBlank(resHeader.getAuthors()) || CollectionUtils.isEmpty(resHeader.getFullAuthors())) { resHeader = new BiblioItem(); + // using the segmentation model to identify the header zones parsers.getHeaderParser().processingHeaderSection(config.getConsolidateHeader(), doc, resHeader); - // above, use the segmentation model result - if (doc.getMetadata() != null) { - Metadata metadata = doc.getMetadata(); - if (metadata.getTitle() != null) - resHeader.setTitle(metadata.getTitle()); - if (metadata.getAuthor() != null) { + } else { + // if the heuristics method was initially used, we still take the abstract derived from the segmentation + // model, because this structure is significantly more reliable with this approach + BiblioItem resHeader2 = new BiblioItem(); + parsers.getHeaderParser().processingHeaderSection(config.getConsolidateHeader(), doc, resHeader2); + if (isNotBlank(resHeader2.getAbstract())) { + resHeader.setAbstract(resHeader2.getAbstract()); + resHeader.setLayoutTokensForLabel(resHeader2.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT), TaggingLabels.HEADER_ABSTRACT); + } + } + + // The commented part below makes use of the PDF embedded metadata (the so-called XMP) if available + // as fall back to set author and title if they have not been found. + // However tests on PMC set 1942 did not improve recognition. This will have to be re-evaluated with + // another, more diverse, testing set and with further updates of the header model. + // DO NOT DELETE ! 
+ /*if (isBlank(resHeader.getTitle()) || isBlank(resHeader.getAuthors()) || CollectionUtils.isEmpty(resHeader.getFullAuthors())) { + // try to exploit PDF embedded metadata (the so-called XMP) if we are still without title/authors + // this is risky as those metadata are highly unreliable, but as last chance, why not :) + Metadata metadata = doc.getMetadata(); + if (metadata != null) { + boolean titleUpdated = false; + boolean authorsUpdated = false; + + if (isNotBlank(metadata.getTitle()) && isBlank(resHeader.getTitle())) { + if (!endsWithAny(lowerCase(metadata.getTitle()), ".doc", ".pdf", ".tex", ".dvi", ".docx", ".odf", ".odt", ".txt")) { + resHeader.setTitle(metadata.getTitle()); + titleUpdated = true; + } + } + + if (isNotBlank(metadata.getAuthor()) + && (isBlank(resHeader.getAuthors()) || CollectionUtils.isEmpty(resHeader.getFullAuthors()))) { resHeader.setAuthors(metadata.getAuthor()); resHeader.setOriginalAuthors(metadata.getAuthor()); + authorsUpdated = true; List localAuthors = parsers.getAuthorParser().processingHeader(metadata.getAuthor()); if (localAuthors != null) { for (Person pers : localAuthors) { @@ -159,11 +190,14 @@ public Document processing(DocumentSource documentSource, } } } - if ( (metadata.getTitle() != null) && (metadata.getAuthor() != null) ) { + + // if title and author have been updated with embedded PDF metadata, we try to consolidate + // again as required + if ( titleUpdated || authorsUpdated ) { parsers.getHeaderParser().consolidateHeader(resHeader, config.getConsolidateHeader()); } } - } + }*/ // structure the abstract using the fulltext model if (isNotBlank(resHeader.getAbstract())) { @@ -183,7 +217,7 @@ public Document processing(DocumentSource documentSource, // citation processing // consolidation, if selected, is not done individually for each citation but - // in a second stage for all citations + // in a second stage for all citations which is much faster List resCitations = parsers.getCitationParser(). 
processingReferenceSection(doc, parsers.getReferenceSegmenterParser(), 0); @@ -209,8 +243,6 @@ else if (config.getConsolidateCitations() == 2) "An exception occured while running consolidation on bibliographical references.", e); } } - //if (resCitations.size() == 0) - // System.out.println("!!!!!! article without citations !!!!"); doc.setBibDataSets(resCitations); // full text processing @@ -587,7 +619,7 @@ else if (nbNumbType > (bibDataSets.size() / 2)) if (tokens == null) { continue; } -//System.out.println("we have " + tokens.size() + " tokens in the block " + blockIndex); + int n = 0;// token position in current block if (blockIndex == dp1.getBlockPtr()) { // n = dp1.getTokenDocPos() - block.getStartToken(); @@ -597,7 +629,6 @@ else if (nbNumbType > (bibDataSets.size() / 2)) // if it's a last block from a document piece, it may end earlier if (blockIndex == dp2.getBlockPtr()) { lastPos = dp2.getTokenBlockPos()+1; -//System.out.println("lastPos: " + lastPos + " / " + tokens.size()); if (lastPos > tokens.size()) { LOGGER.error("DocumentPointer for block " + blockIndex + " points to " + dp2.getTokenBlockPos() + " token, but block token size is " + @@ -605,11 +636,10 @@ else if (nbNumbType > (bibDataSets.size() / 2)) lastPos = tokens.size(); } } -//System.out.println("n/lastPos: " + n + " / " + lastPos); + while (n < lastPos) { if (blockIndex == dp2.getBlockPtr()) { //if (n > block.getEndToken()) { -//System.out.println("n: " + n + " / dp2.getTokenDocPos() - block.getStartToken() " + (dp2.getTokenDocPos() - block.getStartToken())); if (n > dp2.getTokenDocPos() - block.getStartToken()) { break; } @@ -639,7 +669,6 @@ else if (nbNumbType > (bibDataSets.size() / 2)) continue; } - //if (text.equals("\n") || text.equals("\r") ) { if (text.equals("\n")) { newline = true; previousNewline = true; @@ -675,7 +704,7 @@ else if (lineStartX - previousLineStartX > characterWidth) } } } -//System.out.println(text + "\t" + token.getX() + "\t" + lineStartX + "\t" + indented); + 
features.string = text; if (graphicBitmap) { @@ -991,7 +1020,7 @@ public Document createTraining(File inputFile, // we write first the full text untagged (but featurized with segmentation features) String outPathFulltext = pathFullText + File.separator + pdfFileName.replace(".pdf", ".training.segmentation"); - Writer writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFulltext), false), "UTF-8"); + Writer writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFulltext), false), StandardCharsets.UTF_8); writer.write(fulltext + "\n"); writer.close(); @@ -1002,7 +1031,7 @@ public Document createTraining(File inputFile, } String outPathRawtext = pathFullText + File.separator + pdfFileName.replace(".pdf", ".training.segmentation.rawtxt"); - FileUtils.writeStringToFile(new File(outPathRawtext), rawtxt.toString(), "UTF-8"); + FileUtils.writeStringToFile(new File(outPathRawtext), rawtxt.toString(), StandardCharsets.UTF_8); if (isNotBlank(fulltext)) { String rese = parsers.getSegmentationParser().label(fulltext); @@ -1011,7 +1040,7 @@ public Document createTraining(File inputFile, // write the TEI file to reflect the extact layout of the text as extracted from the pdf writer = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator + - pdfFileName.replace(".pdf", ".training.segmentation.tei.xml")), false), "UTF-8"); + pdfFileName.replace(".pdf", ".training.segmentation.tei.xml")), false), StandardCharsets.UTF_8); writer.write("\n\n\t\n\t\t\n\t\n\t\n"); @@ -1034,13 +1063,13 @@ public Document createTraining(File inputFile, if (tei != null) { String outPath = pathTEI + "/" + pdfFileName.replace(".pdf", ".training.references.referenceSegmenter.tei.xml"); - writer = new OutputStreamWriter(new FileOutputStream(new File(outPath), false), "UTF-8"); + writer = new OutputStreamWriter(new FileOutputStream(new File(outPath), false), StandardCharsets.UTF_8); writer.write(tei + "\n"); writer.close(); // generate also the raw vector 
file with the features outPath = pathTEI + "/" + pdfFileName.replace(".pdf", ".training.references.referenceSegmenter"); - writer = new OutputStreamWriter(new FileOutputStream(new File(outPath), false), "UTF-8"); + writer = new OutputStreamWriter(new FileOutputStream(new File(outPath), false), StandardCharsets.UTF_8); writer.write(raw + "\n"); writer.close(); @@ -1048,7 +1077,7 @@ public Document createTraining(File inputFile, outPathRawtext = pathTEI + "/" + pdfFileName .replace(".pdf", ".training.references.referenceSegmenter.rawtxt"); Writer strWriter = new OutputStreamWriter( - new FileOutputStream(new File(outPathRawtext), false), "UTF-8"); + new FileOutputStream(new File(outPathRawtext), false), StandardCharsets.UTF_8); strWriter.write(referencesStr + "\n"); strWriter.close(); } @@ -1080,7 +1109,7 @@ public Document createTraining(File inputFile, Writer writerReference = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator + - pdfFileName.replace(".pdf", ".training.references.tei.xml")), false), "UTF-8"); + pdfFileName.replace(".pdf", ".training.references.tei.xml")), false), StandardCharsets.UTF_8); writerReference.write("\n\n\n\n\t\n\t\n"); } @@ -1177,13 +1206,13 @@ public Document createTraining(File inputFile, if (trainingFigure.getLeft().trim().length() > 0) { String outPathFigures = pathFullText + File.separator + pdfFileName.replace(".pdf", ".training.figure"); - writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFigures), false), "UTF-8"); + writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFigures), false), StandardCharsets.UTF_8); writer.write(trainingFigure.getRight() + "\n\n"); writer.close(); String outPathFiguresTEI = pathTEI + File.separator + pdfFileName.replace(".pdf", ".training.figure.tei.xml"); - writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFiguresTEI), false), "UTF-8"); + writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFiguresTEI), 
false), StandardCharsets.UTF_8); writer.write(trainingFigure.getLeft() + "\n"); writer.close(); } @@ -1193,13 +1222,13 @@ public Document createTraining(File inputFile, if (trainingTable.getLeft().trim().length() > 0) { String outPathTables = pathFullText + File.separator + pdfFileName.replace(".pdf", ".training.table"); - writer = new OutputStreamWriter(new FileOutputStream(new File(outPathTables), false), "UTF-8"); + writer = new OutputStreamWriter(new FileOutputStream(new File(outPathTables), false), StandardCharsets.UTF_8); writer.write(trainingTable.getRight() + "\n\n"); writer.close(); String outPathTablesTEI = pathTEI + File.separator + pdfFileName.replace(".pdf", ".training.table.tei.xml"); - writer = new OutputStreamWriter(new FileOutputStream(new File(outPathTablesTEI), false), "UTF-8"); + writer = new OutputStreamWriter(new FileOutputStream(new File(outPathTablesTEI), false), StandardCharsets.UTF_8); writer.write(trainingTable.getLeft() + "\n"); writer.close(); } @@ -1335,7 +1364,7 @@ public Document createTraining(File inputFile, // write the training TEI file for header which reflects the extract layout of the text as // extracted from the pdf writer = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator - + pdfFileName.replace(".pdf", ".training.header.tei.xml")), false), "UTF-8"); + + pdfFileName.replace(".pdf", ".training.header.tei.xml")), false), StandardCharsets.UTF_8); writer.write("\n\n\t\n\t\t\n\t\n\t 0) { Writer writerAffiliation = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator - + pdfFileName.replace(".pdf", ".training.header.affiliation.tei.xml")), false), "UTF-8"); + + pdfFileName.replace(".pdf", ".training.header.affiliation.tei.xml")), false), StandardCharsets.UTF_8); writerAffiliation.write(""); writerAffiliation.write("\n"); @@ -1375,7 +1404,7 @@ public Document createTraining(File inputFile, if (bufferDate.length() > 0) { Writer writerDate = new OutputStreamWriter(new 
FileOutputStream(new File(pathTEI + File.separator - + pdfFileName.replace(".pdf", ".training.header.date.xml")), false), "UTF-8"); + + pdfFileName.replace(".pdf", ".training.header.date.xml")), false), StandardCharsets.UTF_8); writerDate.write("\n"); writerDate.write("\n"); @@ -1391,7 +1420,7 @@ public Document createTraining(File inputFile, if (bufferName.length() > 0) { Writer writerName = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator - + pdfFileName.replace(".pdf", ".training.header.authors.tei.xml")), false), "UTF-8"); + + pdfFileName.replace(".pdf", ".training.header.authors.tei.xml")), false), StandardCharsets.UTF_8); writerName.write(""); writerName.write("\n"); @@ -1414,7 +1443,7 @@ public Document createTraining(File inputFile, if (bufferReference.length() > 0) { Writer writerReference = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator - + pdfFileName.replace(".pdf", ".training.header.reference.xml")), false), "UTF-8"); + + pdfFileName.replace(".pdf", ".training.header.reference.xml")), false), StandardCharsets.UTF_8); writerReference.write("\n"); writerReference.write("\n"); diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java index 0076a5b6e4..6694c6972a 100755 --- a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java +++ b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java @@ -333,8 +333,8 @@ public Response processCitationPatentPDF(@FormDataParam(INPUT) InputStream pInpu @Produces(MediaType.APPLICATION_XML) @POST public Response processCitationPatentTXT_post(@FormParam(INPUT) String text, - @FormParam("consolidateCitations") String consolidate, - @FormDataParam("includeRawCitations") String includeRawCitations) { + @DefaultValue("0") @FormParam("consolidateCitations") String consolidate, + @DefaultValue("0") @FormParam("includeRawCitations") String 
includeRawCitations) { int consol = validateConsolidationParam(consolidate); boolean includeRaw = validateIncludeRawParam(includeRawCitations); return restProcessString.processCitationPatentTXT(text, consol, includeRaw); diff --git a/grobid-trainer/doc/PMC_sample_1943.results.grobid-0.5.6-SNAPSHOT-Glutton-13.09.2019 b/grobid-trainer/doc/PMC_sample_1943.results.grobid-0.5.6-SNAPSHOT-Glutton-13.09.2019 new file mode 100644 index 0000000000..9af2845be4 --- /dev/null +++ b/grobid-trainer/doc/PMC_sample_1943.results.grobid-0.5.6-SNAPSHOT-Glutton-13.09.2019 @@ -0,0 +1,341 @@ +Evaluation metrics produced in 835.197 seconds + +======= Header metadata ======= + +Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 81.91 14.86 13.97 14.4 1911 +authors 98.1 91.41 91.04 91.22 1941 +first_author 99.04 96.21 95.47 95.84 1941 +keywords 92.81 65.56 53.12 58.69 1380 +title 96.87 85.46 85.28 85.37 1943 + +all (micro avg.) 93.74 72.04 68.86 70.41 9116 +all (macro avg.) 93.74 70.7 67.77 69.1 9116 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 89.27 52.25 49.14 50.65 1911 +authors 98.16 91.72 91.34 91.53 1941 +first_author 99.06 96.31 95.57 95.94 1941 +keywords 94.32 77.91 63.12 69.74 1380 +title 98.46 92.93 92.74 92.84 1943 + +all (micro avg.) 95.85 83.09 79.42 81.22 9116 +all (macro avg.) 95.85 82.23 78.38 80.14 9116 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 96.38 88.37 83.1 85.65 1911 +authors 99.05 95.91 95.52 95.72 1941 +first_author 99.12 96.63 95.88 96.25 1941 +keywords 95.65 88.82 71.96 79.5 1380 +title 99.11 96.03 95.83 95.93 1943 + +all (micro avg.) 
97.86 93.63 89.49 91.51 9116 +all (macro avg.) 97.86 93.15 88.46 90.61 9116 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 95.05 81.64 76.77 79.13 1911 +authors 98.58 93.69 93.3 93.5 1941 +first_author 99.04 96.21 95.47 95.84 1941 +keywords 95.08 84.17 68.19 75.34 1380 +title 99 95.51 95.32 95.41 1943 + +all (micro avg.) 97.35 90.94 86.92 88.89 9116 +all (macro avg.) 97.35 90.24 85.81 87.84 9116 + +===== Instance-level results ===== + +Total expected instances: 1943 +Total correct instances: 177 (strict) +Total correct instances: 684 (soft) +Total correct instances: 1264 (Levenshtein) +Total correct instances: 1124 (ObservedRatcliffObershelp) + +Instance-level recall: 9.11 (strict) +Instance-level recall: 35.2 (soft) +Instance-level recall: 65.05 (Levenshtein) +Instance-level recall: 57.85 (RatcliffObershelp) + +======= Citation metadata ======= + +Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 97.54 83.03 74.03 78.27 85778 +date 98.95 92.98 81.6 86.92 87067 +first_author 98.53 90.22 80.42 85.04 85778 +id 98.99 0 0 0 0 +inTitle 95.99 71.75 69.7 70.71 81007 +issue 99.58 89.34 82.14 85.59 16635 +page 98.54 93.06 82.22 87.3 80501 +title 97.02 78.5 72.14 75.19 80736 +volume 99.22 95.08 87.33 91.04 80067 + +all (micro avg.) 98.17 86.32 78.32 82.12 597569 +all (macro avg.) 
98.17 86.74 78.7 82.51 597569 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 97.63 83.63 74.57 78.84 85778 +date 98.95 92.98 81.6 86.92 87067 +first_author 98.56 90.4 80.58 85.21 85778 +id 98.99 0 0 0 0 +inTitle 97.62 83.21 80.83 82.01 81007 +issue 99.58 89.34 82.14 85.59 16635 +page 98.54 93.06 82.22 87.3 80501 +title 98.52 89.78 82.5 85.99 80736 +volume 99.22 95.08 87.33 91.04 80067 + +all (micro avg.) 98.58 89.64 81.33 85.28 597569 +all (macro avg.) 98.58 89.69 81.47 85.36 597569 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98.37 88.99 79.35 83.89 85778 +date 98.95 92.98 81.6 86.92 87067 +first_author 98.58 90.52 80.69 85.33 85778 +id 98.99 0 0 0 0 +inTitle 97.75 84.16 81.75 82.94 81007 +issue 99.58 89.34 82.14 85.59 16635 +page 98.54 93.06 82.22 87.3 80501 +title 98.93 92.85 85.32 88.92 80736 +volume 99.22 95.08 87.33 91.04 80067 + +all (micro avg.) 98.74 90.96 82.54 86.55 597569 +all (macro avg.) 98.74 90.87 82.55 86.49 597569 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 97.93 85.83 76.52 80.91 85778 +date 98.95 92.98 81.6 86.92 87067 +first_author 98.54 90.24 80.44 85.06 85778 +id 98.99 0 0 0 0 +inTitle 97.42 81.79 79.45 80.6 81007 +issue 99.58 89.34 82.14 85.59 16635 +page 98.54 93.06 82.22 87.3 80501 +title 98.81 91.89 84.43 88 80736 +volume 99.22 95.08 87.33 91.04 80067 + +all (micro avg.) 98.62 90 81.66 85.63 597569 +all (macro avg.) 
98.62 90.02 81.77 85.68 597569 + +===== Instance-level results ===== + +Total expected instances: 90125 +Total extracted instances: 89712 +Total correct instances: 37566 (strict) +Total correct instances: 48862 (soft) +Total correct instances: 53306 (Levenshtein) +Total correct instances: 50086 (RatcliffObershelp) + +Instance-level precision: 41.87 (strict) +Instance-level precision: 54.47 (soft) +Instance-level precision: 59.42 (Levenshtein) +Instance-level precision: 55.83 (RatcliffObershelp) + +Instance-level recall: 41.68 (strict) +Instance-level recall: 54.22 (soft) +Instance-level recall: 59.15 (Levenshtein) +Instance-level recall: 55.57 (RatcliffObershelp) + +Instance-level f-score: 41.78 (strict) +Instance-level f-score: 54.34 (soft) +Instance-level f-score: 59.28 (Levenshtein) +Instance-level f-score: 55.7 (RatcliffObershelp) + +Matching 1 : 65059 + +Matching 2 : 4630 + +Matching 3 : 2718 + +Matching 4 : 709 + +Total matches : 73116 + +======= Citation context resolution ======= + +Total expected references: 90125 - 46.38 references per article +Total predicted references: 89712 - 46.17 references per article + +Total expected citation contexts: 139835 - 71.97 citation contexts per article +Total predicted citation contexts: 117980 - 60.72 citation contexts per article + +Total correct predicted citation contexts: 95984 - 49.4 citation contexts per article +Total wrong predicted citation contexts: 21996 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) + +Precision citation contexts: 81.36 +Recall citation contexts: 68.64 +fscore citation contexts: 74.46 + +======= Fulltext structures ======= + +Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). 
+ +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +figure_title 96.76 32 23.79 27.29 7058 +reference_citation 58.51 57.28 57.43 57.35 134196 +reference_figure 94.54 61.02 60.73 60.87 19330 +reference_table 99.03 81.26 82.63 81.94 7327 +section_title 94.6 76.6 66.23 71.03 27619 +table_title 98.79 56.85 48.81 52.52 3784 + +all (micro avg.) 90.37 60.27 58.54 59.39 199314 +all (macro avg.) 90.37 60.83 56.6 58.5 199314 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +figure_title 98.67 83.57 62.13 71.27 7058 +reference_citation 61.15 61.56 61.72 61.64 134196 +reference_figure 94.48 62.15 61.86 62 19330 +reference_table 99.03 81.83 83.2 82.51 7327 +section_title 95.27 81.42 70.4 75.51 27619 +table_title 99.48 87.1 74.79 80.48 3784 + +all (micro avg.) 91.35 65.88 63.99 64.92 199314 +all (macro avg.) 
91.35 76.27 69.02 72.23 199314 + + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.TableRejectionCounters +************************************************************************************ +------------------------------------------------------------------------------------ + CANNOT_PARSE_LABEL_TO_INT: 162 + CONTENT_SIZE_TOO_SMALL: 99 + CONTENT_WIDTH_TOO_SMALL: 16 + FEW_TOKENS_IN_CONTENT: 1 + EMPTY_LABEL_OR_HEADER_OR_CONTENT: 2268 + HEADER_NOT_STARTS_WITH_TABLE_WORD: 189 + HEADER_NOT_CONSECUTIVE: 497 + HEADER_AND_CONTENT_DIFFERENT_PAGES: 4 + HEADER_AND_CONTENT_INTERSECT: 644 + FEW_TOKENS_IN_HEADER: 5 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.ReferenceMarkerMatcherCounters +************************************************************************************ +------------------------------------------------------------------------------------ + UNMATCHED_REF_MARKERS: 7693 + MATCHED_REF_MARKERS_AFTER_POST_FILTERING: 1559 + STYLE_AUTHORS: 37134 + STYLE_NUMBERED: 51166 + MANY_CANDIDATES: 3839 + MANY_CANDIDATES_AFTER_POST_FILTERING: 303 + NO_CANDIDATES: 15084 + INPUT_REF_STRINGS_CNT: 90669 + MATCHED_REF_MARKERS: 117980 + NO_CANDIDATES_AFTER_POST_FILTERING: 1865 + STYLE_OTHER: 2369 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.label.TaggingLabelImpl +************************************************************************************ +------------------------------------------------------------------------------------ + CITATION_TITLE: 85162 + NAME-HEADER_MIDDLENAME: 9910 + TABLE_FIGDESC: 5365 + NAME-HEADER_SURNAME: 24055 + NAME-CITATION_OTHER: 420511 + 
CITATION_BOOKTITLE: 4047 + FULLTEXT_SECTION_MARKER: 6 + CITATION_NOTE: 11691 + FULLTEXT_CITATION_MARKER: 178907 + FULLTEXT_TABLE_MARKER: 14781 + CITATION_WEB: 1365 + TABLE_LABEL: 3232 + FULLTEXT_SECTION: 50089 + NAME-HEADER_FORENAME: 24457 + TABLE_CONTENT: 5805 + CITATION_COLLABORATION: 118 + CITATION_ISSUE: 17112 + CITATION_JOURNAL: 80268 + NAME-CITATION_SURNAME: 321645 + TABLE_FIGURE_HEAD: 4537 + FULLTEXT_EQUATION_MARKER: 1538 + CITATION_OTHER: 441478 + FULLTEXT_FIGURE_MARKER: 38407 + CITATION_TECH: 243 + FIGURE_CONTENT: 3210 + FIGURE_LABEL: 5341 + FULLTEXT_EQUATION_LABEL: 1736 + FULLTEXT_EQUATION: 3336 + CITATION_DATE: 88439 + CITATION_AUTHOR: 87183 + FULLTEXT_FIGURE: 14111 + FULLTEXT_TABLE: 11086 + CITATION_EDITOR: 2740 + FULLTEXT_OTHER: 16 + NAME-HEADER_OTHER: 28733 + FIGURE_FIGDESC: 6328 + NAME-HEADER_SUFFIX: 24 + CITATION_VOLUME: 78059 + CITATION_LOCATION: 7091 + NAME-CITATION_SUFFIX: 545 + NAME-HEADER_TITLE: 1091 + CITATION_INSTITUTION: 930 + CITATION_PAGES: 81489 + NAME-HEADER_MARKER: 15087 + NAME-CITATION_FORENAME: 310938 + CITATION_PUBLISHER: 4661 + NAME-CITATION_MIDDLENAME: 61570 + CITATION_PUBNUM: 3258 + FULLTEXT_PARAGRAPH: 371345 + FIGURE_FIGURE_HEAD: 8848 +==================================================================================== +====================================================================================