diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index 6a8eae4c769..974f4403a9d 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -24,6 +24,7 @@ import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.LinkedFile; import org.jabref.model.entry.field.StandardField; +import org.jabref.model.entry.identifier.ArXivIdentifier; import org.jabref.model.entry.identifier.DOI; import org.jabref.model.entry.types.EntryType; import org.jabref.model.entry.types.StandardEntryType; @@ -51,6 +52,8 @@ public class PdfContentImporter extends PdfImporter { private static final Pattern YEAR_EXTRACT_PATTERN = Pattern.compile("\\d{4}"); + private static final int ARXIV_PREFIX_LENGTH = "arxiv:".length(); + // input lines into several lines private String[] lines; @@ -372,11 +375,13 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS String volume = null; String number = null; String pages = null; + String arXivId = null; // year is a class variable as the method extractYear() uses it; String publisher = null; EntryType type = StandardEntryType.InProceedings; if (curString.length() > 4) { + arXivId = getArXivId(null); // special case: possibly conference as first line on the page extractYear(); doi = getDoi(null); @@ -396,6 +401,7 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } } + arXivId = getArXivId(arXivId); // start: title fillCurStringWithNonEmptyLines(); title = streamlineTitle(curString); @@ -515,6 +521,7 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } } else { doi = getDoi(doi); + arXivId = getArXivId(arXivId); if ((publisher == null) && curString.contains("IEEE")) { // IEEE has the conference things at the end @@ -539,8 +546,7 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } } - BibEntry entry = new BibEntry(); - entry.setType(type); + BibEntry entry = new BibEntry(type); // TODO: institution parsing missing @@ -565,6 +571,15 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS if (doi != null) { entry.setField(StandardField.DOI, doi); } + if (arXivId != null) { + entry.setField(StandardField.EPRINT, arXivId); + assert !arXivId.startsWith("arxiv"); + entry.setField(StandardField.EPRINTTYPE, "arXiv"); + + // Quick workaround to avoid wrong year and number parsing + number = null; // "Germany" in org.jabref.logic.importer.fileformat.PdfContentImporterTest.extractArXivFromPage + year = null; // "2408" in org.jabref.logic.importer.fileformat.PdfContentImporterTest.extractArXivFromPage + } if (series != null) { entry.setField(StandardField.SERIES, series); } @@ -600,6 +615,23 @@ private String getDoi(String doi) { return doi; } + private String getArXivId(String arXivId) { + if (arXivId != null) { + return arXivId; + } + + String arXiv = curString.split(" ")[0]; + arXivId = ArXivIdentifier.parse(arXiv).map(ArXivIdentifier::asString).orElse(null); + + if (arXivId == null || curString.length() < arXivId.length() + ARXIV_PREFIX_LENGTH) { + return arXivId; + } + + proceedToNextNonEmptyLine(); + + return arXivId; + } + private String getFirstPageContents(PDDocument document) throws IOException { PDFTextStripper stripper = new PDFTextStripper(); diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java index bb461423a76..992b3c9fcdb 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java @@ -15,6 +15,7 @@ import org.jabref.logic.importer.FetcherException; import org.jabref.logic.importer.ImportFormatPreferences; import org.jabref.logic.importer.ParserResult; +import org.jabref.logic.importer.fetcher.ArXivFetcher; import org.jabref.logic.importer.fetcher.DoiFetcher; import org.jabref.logic.importer.fetcher.isbntobibtex.IsbnFetcher; import org.jabref.logic.importer.util.FileFieldParser; @@ -108,6 +109,13 @@ public ParserResult importDatabase(Path filePath) throws IOException { LOGGER.error("Fetching failed for ISBN \"{}\".", candidate.getField(StandardField.ISBN).get(), e); } } + if (candidate.hasField(StandardField.EPRINT)) { + try { + new ArXivFetcher(importFormatPreferences).performSearchById(candidate.getField(StandardField.EPRINT).get()).ifPresent(fetchedCandidates::add); + } catch (FetcherException e) { + LOGGER.error("Fetching failed for arXiv ID \"{}\".", candidate.getField(StandardField.EPRINT).get(), e); + } + } } candidates.addAll(0, fetchedCandidates); BibEntry entry = new BibEntry(); diff --git a/src/main/java/org/jabref/model/entry/field/StandardField.java b/src/main/java/org/jabref/model/entry/field/StandardField.java index 842883ff682..8605be9f85b 100644 --- a/src/main/java/org/jabref/model/entry/field/StandardField.java +++ b/src/main/java/org/jabref/model/entry/field/StandardField.java @@ -51,9 +51,12 @@ public enum StandardField implements Field { EDITORCTYPE("editorctype", FieldProperty.EDITOR_TYPE), EID("eid"), ENTRYSET("entryset", FieldProperty.MULTIPLE_ENTRY_LINK), + + // For the syntax of a "combined" field, see {@link org.jabref.logic.cleanup.EprintCleanupTest.cleanupCompleteEntry} for examples EPRINT("eprint", FieldProperty.VERBATIM, FieldProperty.IDENTIFIER), EPRINTCLASS("eprintclass"), EPRINTTYPE("eprinttype"), + EVENTDATE("eventdate", FieldProperty.DATE), EVENTTITLE("eventtitle"), EVENTTITLEADDON("eventtitleaddon"), diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index 427c24eb2e9..ea98e5539f0 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -129,6 +129,44 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n", Optional.empty())); } + @Test + void extractArXivFromPage() { + BibEntry entry = new BibEntry(StandardEntryType.TechReport) + .withField(StandardField.AUTHOR, "Filippo Riccaa and Alessandro Marchettob and Andrea Stoccoc") + .withField(StandardField.TITLE, "A Multi-Year Grey Literature Review on AI-assisted Test Automation") + .withField(StandardField.EPRINT, "2408.06224v1") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField((StandardField.KEYWORDS), "Test Automation Artificial Intelligence AI-assisted Test Automation Grey Literature Automated Test Generation Self-Healing Test Scripts"); + + // This is from https://arxiv.org/abs/2408.06224 + String firstPageContent = """ + A Multi-Year Grey Literature Review on AI-assisted Test Automation + + Filippo Riccaa, Alessandro Marchettob and Andrea Stoccoc + + aUniversity of Genoa, Via Balbi 5, Genova, 16126, Italy + bUniversity of Trento, Via Sommarive 9, Trento, 38123, Italy + cTechnical University of Munich, Boltzmannstraße 3, Munich, 85748, Germany + dfortiss GmbH, Guerickestraße 25, Munich, 80805, Germany + + Keywords: + Test Automation + Artificial Intelligence + AI-assisted Test Automation + Grey Literature + Automated Test Generation + Self-Healing Test Scripts + + *Corresponding author + filippo.ricca@unige.it (F. Ricca) + https://person.dibris.unige.it/ricca-filippo/ (F. Ricca) + ORCID(s): 0000-0002-3928-5408 (F. Ricca); 0000-0002-6833-896X (A. Marchetto); 0000-0001-8956-3894 (A. Stocco) + + arXiv:2408.06224v1 [cs.SE] 12 Aug 2024"""; + + assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n", Optional.empty())); + } + @ParameterizedTest @MethodSource("providePdfData") void pdfTitleExtraction(String expectedTitle, String filePath) throws Exception {