Implemented arXivId Parsing for PDF with arXivId (#12335)

* Implimented arXivId Parsing forPDF with arXivId * added Optional parameter * merged fixes * removed csl-styles * fixed null arxiv issue on external imports * Improved getArxivId Implementation * reduced nesting and added arxiv constant * reduced nesting * modified testcase * fix abbrev repo * removed unnecessary 'if' clause * Use EPRINTTYPE Co-authored-by: Carl Christian Snethlage <50491877+calixtus@users.noreply.github.com> Co-authored-by: Christoph <siedlerkiller@gmail.com> Co-authored-by: Ruslan <ruslanpopov1512@gmail.com> Co-authored-by: Subhramit Basu Bhowmick <74734844+subhramit@users.noreply.github.com> * Update src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java Co-authored-by: Ruslan <ruslanpopov1512@gmail.com> * WIP Co-authored-by: Carl Christian Snethlage <50491877+calixtus@users.noreply.github.com> Co-authored-by: Christoph <siedlerkiller@gmail.com> Co-authored-by: Ruslan <ruslanpopov1512@gmail.com> Co-authored-by: Subhramit Basu Bhowmick <74734844+subhramit@users.noreply.github.com> * Add completion using arXiv ID Co-authored-by: Carl Christian Snethlage <50491877+calixtus@users.noreply.github.com> Co-authored-by: Christoph <siedlerkiller@gmail.com> Co-authored-by: Ruslan <ruslanpopov1512@gmail.com> Co-authored-by: Subhramit Basu Bhowmick <74734844+subhramit@users.noreply.github.com> * Adapt test case Co-authored-by: Carl Christian Snethlage <50491877+calixtus@users.noreply.github.com> Co-authored-by: Christoph <siedlerkiller@gmail.com> Co-authored-by: Ruslan <ruslanpopov1512@gmail.com> Co-authored-by: Subhramit Basu Bhowmick <74734844+subhramit@users.noreply.github.com> * Fix test name * "Fix2 number and year extraction for arXiv --------- Co-authored-by: Siedlerchr <siedlerkiller@gmail.com> Co-authored-by: Oliver Kopp <kopp.dev@gmail.com> Co-authored-by: Carl Christian Snethlage <50491877+calixtus@users.noreply.github.com> Co-authored-by: Ruslan <ruslanpopov1512@gmail.com> Co-authored-by: Subhramit Basu Bhowmick 
<74734844+subhramit@users.noreply.github.com>
JabRef · Jan 20, 2025 · 4aa313d · 4aa313d
1 parent add35ab
commit 4aa313d
Show file tree

Hide file tree

Showing 4 changed files with 83 additions and 2 deletions.
diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java
@@ -24,6 +24,7 @@
 import org.jabref.model.entry.BibEntry;
 import org.jabref.model.entry.LinkedFile;
 import org.jabref.model.entry.field.StandardField;
+import org.jabref.model.entry.identifier.ArXivIdentifier;
 import org.jabref.model.entry.identifier.DOI;
 import org.jabref.model.entry.types.EntryType;
 import org.jabref.model.entry.types.StandardEntryType;
@@ -51,6 +52,8 @@ public class PdfContentImporter extends PdfImporter {
 
     private static final Pattern YEAR_EXTRACT_PATTERN = Pattern.compile("\\d{4}");
 
+    private static final int ARXIV_PREFIX_LENGTH = "arxiv:".length();
+
     // input lines into several lines
     private String[] lines;
 
@@ -372,11 +375,13 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         String volume = null;
         String number = null;
         String pages = null;
+        String arXivId = null;
         // year is a class variable as the method extractYear() uses it;
         String publisher = null;
 
         EntryType type = StandardEntryType.InProceedings;
         if (curString.length() > 4) {
+            arXivId = getArXivId(null);
             // special case: possibly conference as first line on the page
             extractYear();
             doi = getDoi(null);
@@ -396,6 +401,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
             }
         }
 
+        arXivId = getArXivId(arXivId);
         // start: title
         fillCurStringWithNonEmptyLines();
         title = streamlineTitle(curString);
@@ -515,6 +521,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
                 }
             } else {
                 doi = getDoi(doi);
+                arXivId = getArXivId(arXivId);
 
                 if ((publisher == null) && curString.contains("IEEE")) {
                     // IEEE has the conference things at the end
@@ -539,8 +546,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
             }
         }
 
-        BibEntry entry = new BibEntry();
-        entry.setType(type);
+        BibEntry entry = new BibEntry(type);
 
         // TODO: institution parsing missing
 
@@ -565,6 +571,15 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         if (doi != null) {
             entry.setField(StandardField.DOI, doi);
         }
+        if (arXivId != null) {
+            entry.setField(StandardField.EPRINT, arXivId);
+            assert !arXivId.startsWith("arxiv");
+            entry.setField(StandardField.EPRINTTYPE, "arXiv");
+
+            // Quick workaround to avoid wrong year and number parsing
+            number = null; // "Germany" in org.jabref.logic.importer.fileformat.PdfContentImporterTest.extractArXivFromPage
+            year = null; // "2408" in org.jabref.logic.importer.fileformat.PdfContentImporterTest.extractArXivFromPage
+        }
         if (series != null) {
             entry.setField(StandardField.SERIES, series);
         }
@@ -600,6 +615,23 @@ private String getDoi(String doi) {
         return doi;
     }
 
+    /**
+     * Tries to read an arXiv identifier from the first space-separated token of {@code curString}.
+     * <p>
+     * An identifier that was already found is kept: a non-null argument is returned unchanged and
+     * nothing is parsed.
+     * <p>
+     * When an identifier is parsed and the current line is at least as long as the identifier plus the
+     * {@code "arxiv:"} prefix, {@code proceedToNextNonEmptyLine()} is called before returning.
+     *
+     * @param arXivId the identifier found so far, or {@code null} if none has been found yet
+     * @return the given identifier if it is non-null; otherwise the identifier parsed from the current
+     *         line, or {@code null} if the first token is not an arXiv identifier
+     */
+    private String getArXivId(String arXivId) {
+        if (arXivId != null) {
+            return arXivId;
+        }
+
+        // A limit of 2 keeps trailing empty strings, so the array always has at least one element.
+        // With the default limit, a line consisting only of spaces splits into an empty array
+        // and indexing [0] would throw ArrayIndexOutOfBoundsException.
+        String firstToken = curString.split(" ", 2)[0];
+        String parsedId = ArXivIdentifier.parse(firstToken).map(ArXivIdentifier::asString).orElse(null);
+
+        if (parsedId == null || curString.length() < parsedId.length() + ARXIV_PREFIX_LENGTH) {
+            return parsedId;
+        }
+
+        proceedToNextNonEmptyLine();
+
+        return parsedId;
+    }
+
     private String getFirstPageContents(PDDocument document) throws IOException {
         PDFTextStripper stripper = new PDFTextStripper();
 

diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java
@@ -15,6 +15,7 @@
 import org.jabref.logic.importer.FetcherException;
 import org.jabref.logic.importer.ImportFormatPreferences;
 import org.jabref.logic.importer.ParserResult;
+import org.jabref.logic.importer.fetcher.ArXivFetcher;
 import org.jabref.logic.importer.fetcher.DoiFetcher;
 import org.jabref.logic.importer.fetcher.isbntobibtex.IsbnFetcher;
 import org.jabref.logic.importer.util.FileFieldParser;
@@ -108,6 +109,13 @@ public ParserResult importDatabase(Path filePath) throws IOException {
                     LOGGER.error("Fetching failed for ISBN \"{}\".", candidate.getField(StandardField.ISBN).get(), e);
                 }
             }
+            if (candidate.hasField(StandardField.EPRINT)) {
+                try {
+                    new ArXivFetcher(importFormatPreferences).performSearchById(candidate.getField(StandardField.EPRINT).get()).ifPresent(fetchedCandidates::add);
+                } catch (FetcherException e) {
+                    LOGGER.error("Fetching failed for arXiv ID \"{}\".", candidate.getField(StandardField.EPRINT).get(), e);
+                }
+            }
         }
         candidates.addAll(0, fetchedCandidates);
         BibEntry entry = new BibEntry();

diff --git a/src/main/java/org/jabref/model/entry/field/StandardField.java b/src/main/java/org/jabref/model/entry/field/StandardField.java
@@ -51,9 +51,12 @@ public enum StandardField implements Field {
     EDITORCTYPE("editorctype", FieldProperty.EDITOR_TYPE),
     EID("eid"),
     ENTRYSET("entryset", FieldProperty.MULTIPLE_ENTRY_LINK),
+
+    // For the syntax of a "combined" field, see {@link org.jabref.logic.cleanup.EprintCleanupTest.cleanupCompleteEntry} for examples
     EPRINT("eprint", FieldProperty.VERBATIM, FieldProperty.IDENTIFIER),
     EPRINTCLASS("eprintclass"),
     EPRINTTYPE("eprinttype"),
+
     EVENTDATE("eventdate", FieldProperty.DATE),
     EVENTTITLE("eventtitle"),
     EVENTTITLEADDON("eventtitleaddon"),

diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java
@@ -129,6 +129,44 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296
         assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n", Optional.empty()));
     }
 
+    /**
+     * Checks that an arXiv stamp on the first page ends up in the {@code eprint} and
+     * {@code eprinttype} fields of the extracted entry.
+     */
+    @Test
+    void extractArXivFromPage() {
+        // First page of https://arxiv.org/abs/2408.06224; the arXiv stamp is its last line
+        String firstPageContent = """
+                A Multi-Year Grey Literature Review on AI-assisted Test Automation
+
+                Filippo Riccaa, Alessandro Marchettob and Andrea Stoccoc
+
+                aUniversity of Genoa, Via Balbi 5, Genova, 16126, Italy
+                bUniversity of Trento, Via Sommarive 9, Trento, 38123, Italy
+                cTechnical University of Munich, Boltzmannstraße 3, Munich, 85748, Germany
+                dfortiss GmbH, Guerickestraße 25, Munich, 80805, Germany
+
+                Keywords:
+                Test Automation
+                Artificial Intelligence
+                AI-assisted Test Automation
+                Grey Literature
+                Automated Test Generation
+                Self-Healing Test Scripts
+
+                *Corresponding author
+                filippo.ricca@unige.it (F. Ricca)
+                https://person.dibris.unige.it/ricca-filippo/ (F. Ricca)
+                ORCID(s): 0000-0002-3928-5408 (F. Ricca); 0000-0002-6833-896X (A. Marchetto); 0000-0001-8956-3894 (A. Stocco)
+
+                arXiv:2408.06224v1 [cs.SE] 12 Aug 2024""";
+
+        // The expected entry sets neither a year nor a number field
+        BibEntry expected = new BibEntry(StandardEntryType.TechReport)
+                .withField(StandardField.AUTHOR, "Filippo Riccaa and Alessandro Marchettob and Andrea Stoccoc")
+                .withField(StandardField.TITLE, "A Multi-Year Grey Literature Review on AI-assisted Test Automation")
+                .withField(StandardField.EPRINT, "2408.06224v1")
+                .withField(StandardField.EPRINTTYPE, "arXiv")
+                .withField(StandardField.KEYWORDS, "Test Automation Artificial Intelligence AI-assisted Test Automation Grey Literature Automated Test Generation Self-Healing Test Scripts");
+
+        Optional<BibEntry> actual = importer.getEntryFromPDFContent(firstPageContent, "\n", Optional.empty());
+
+        assertEquals(Optional.of(expected), actual);
+    }
+
     @ParameterizedTest
     @MethodSource("providePdfData")
     void pdfTitleExtraction(String expectedTitle, String filePath) throws Exception {