From 220ddac1e08917f6fc9f8525d60b74d2725c2bda Mon Sep 17 00:00:00 2001 From: Aryan Rana Date: Tue, 24 Dec 2024 18:35:55 +0530 Subject: [PATCH 01/18] Implimented arXivId Parsing forPDF with arXivId --- .../fileformat/PdfContentImporter.java | 35 ++++++++++++++++- .../fileformat/PdfContentImporterTest.java | 39 +++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index 7d43be038a4..8b489ffc1e2 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -24,6 +24,7 @@ import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.LinkedFile; import org.jabref.model.entry.field.StandardField; +import org.jabref.model.entry.identifier.ArXivIdentifier; import org.jabref.model.entry.identifier.DOI; import org.jabref.model.entry.types.EntryType; import org.jabref.model.entry.types.StandardEntryType; @@ -364,6 +365,7 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS String volume = null; String number = null; String pages = null; + String arXivId = null; // year is a class variable as the method extractYear() uses it; String publisher = null; @@ -372,6 +374,7 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS // special case: possibly conference as first line on the page extractYear(); doi = getDoi(null); + arXivId = getArXivId(null); if (curString.contains("Conference")) { fillCurStringWithNonEmptyLines(); conference = curString; @@ -507,6 +510,7 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } } else { doi = getDoi(doi); + arXivId = getArXivId(arXivId); if ((publisher == null) && curString.contains("IEEE")) { // IEEE has the conference things at the end @@ -557,19 +561,25 @@ Optional 
getEntryFromPDFContent(String firstpageContents, String lineS if (doi != null) { entry.setField(StandardField.DOI, doi); } + if (arXivId != null) { + entry.setField(StandardField.EPRINT, arXivId); + } if (series != null) { entry.setField(StandardField.SERIES, series); } if (volume != null) { entry.setField(StandardField.VOLUME, volume); } - if (number != null) { + if (number != null && number.chars().allMatch(Character::isDigit)) { entry.setField(StandardField.NUMBER, number); } if (pages != null) { entry.setField(StandardField.PAGES, pages); } - if (year != null) { + if (year != null && !year.equals("0000")) { + entry.setField(StandardField.YEAR, year); + } else if (arXivId != null) { + year = "20" + arXivId.substring(0, 2); entry.setField(StandardField.YEAR, year); } if (publisher != null) { @@ -592,6 +602,27 @@ private String getDoi(String doi) { return doi; } + private String getArXivId(String arXivId) { + int currIndex = 0; + for (int i = 0; i < lines.length; i++) { + if (curString.equals(lines[i])) { + currIndex = i; + break; + } + } + if (arXivId == null) { + for (String line: lines) { + curString = line; + arXivId = ArXivIdentifier.parse(curString).map(ArXivIdentifier::asString).orElse(null); + if (arXivId != null) { + curString = lines[currIndex]; + return arXivId; + } + } + } + return arXivId; + } + private String getFirstPageContents(PDDocument document) throws IOException { PDFTextStripper stripper = new PDFTextStripper(); diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index 318cb858508..a58159adc72 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -129,6 +129,45 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 assertEquals(Optional.of(entry), 
importer.getEntryFromPDFContent(firstPageContent, "\n", "")); } + @Test + void extractArXivFromPage1() { + BibEntry entry = new BibEntry(StandardEntryType.TechReport) + .withField(StandardField.AUTHOR, "Filippo Ricca and Alessandro Marchetto and Andrea Stoccoc") + .withField(StandardField.TITLE, "A Multi-Year Grey Literature Review on AI-assisted Test Automation") + .withField(StandardField.YEAR, "2024") + .withField(StandardField.EPRINT, "2408.06224v1") + .withField((StandardField.KEYWORDS), "Test Automation Artificial Intelligence AI-assisted Test Automation Grey Literature Automated Test Generation Self-Healing Test Scripts"); + + String firstPageContent = """ + A Multi-Year Grey Literature Review on AI-assisted Test Automation + + Filippo Ricca, Alessandro Marchetto and Andrea Stoccoc + + aUniversity of Genoa, Via Balbi 5, Genova, 16126, Italy + bUniversity of Trento, Via Sommarive 9, Trento, 38123, Italy + cTechnical University of Munich, Boltzmannstraße 3, Munich, 85748, Germany + dfortiss GmbH, Guerickestraße 25, Munich, 80805, Germany + + Keywords: + Test Automation + Artificial Intelligence + AI-assisted Test Automation + Grey Literature + Automated Test Generation + Self-Healing Test Scripts + + arXiv:2408.06224v1 + [cs.SE] 12 Aug 2024 + + *Corresponding author + filippo.ricca@unige.it (F. Ricca) + https://person.dibris.unige.it/ricca-filippo/ (F. Ricca) + + ORCID(s): 0000-0002-3928-5408 (F. Ricca); 0000-0002-6833-896X (A. Marchetto); 0000-0001-8956-3894 (A. 
Stocco)"""; + + assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n", "")); + } + @ParameterizedTest @MethodSource("providePdfData") void pdfTitleExtraction(String expectedTitle, String filePath) throws Exception { From 32e98674544f32507b4b0ad49654c42df40d7ef3 Mon Sep 17 00:00:00 2001 From: Aryan Rana Date: Wed, 25 Dec 2024 11:16:05 +0530 Subject: [PATCH 02/18] added Optional parameter --- .../logic/importer/fileformat/PdfContentImporterTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index a58159adc72..a8296e20bf6 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -165,7 +165,7 @@ void extractArXivFromPage1() { ORCID(s): 0000-0002-3928-5408 (F. Ricca); 0000-0002-6833-896X (A. Marchetto); 0000-0001-8956-3894 (A. 
Stocco)"""; - assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n", "")); + assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n", Optional.empty())); } @ParameterizedTest From 89de3786c6c28917f76e39a557ef19573c252515 Mon Sep 17 00:00:00 2001 From: Aryan Rana Date: Wed, 25 Dec 2024 12:54:25 +0530 Subject: [PATCH 03/18] merged fixes --- buildres/abbrv.jabref.org | 2 +- .../importer/fileformat/PdfContentImporter.java | 13 ++----------- src/main/resources/csl-styles | 2 +- .../importer/fileformat/PdfContentImporterTest.java | 1 - 4 files changed, 4 insertions(+), 14 deletions(-) diff --git a/buildres/abbrv.jabref.org b/buildres/abbrv.jabref.org index 78e1b08f044..0fdf99147a8 160000 --- a/buildres/abbrv.jabref.org +++ b/buildres/abbrv.jabref.org @@ -1 +1 @@ -Subproject commit 78e1b08f04405c376ae65488a1b268ee938750ce +Subproject commit 0fdf99147a8a5fc8ae7ccd79ad4e0029e736e4a3 diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index 4e31c6d9493..9a22ec9a86d 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -576,7 +576,7 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS if (pages != null) { entry.setField(StandardField.PAGES, pages); } - if (year != null && !year.equals("0000")) { + if (year != null && !"0000".equals(year)) { entry.setField(StandardField.YEAR, year); } else if (arXivId != null) { year = "20" + arXivId.substring(0, 2); @@ -603,19 +603,10 @@ private String getDoi(String doi) { } private String getArXivId(String arXivId) { - int currIndex = 0; - for (int i = 0; i < lines.length; i++) { - if (curString.equals(lines[i])) { - currIndex = i; - break; - } - } if (arXivId == null) { for (String line: lines) { - curString = line; - arXivId = 
ArXivIdentifier.parse(curString).map(ArXivIdentifier::asString).orElse(null); + arXivId = ArXivIdentifier.parse(line).map(ArXivIdentifier::asString).orElse(null); if (arXivId != null) { - curString = lines[currIndex]; return arXivId; } } diff --git a/src/main/resources/csl-styles b/src/main/resources/csl-styles index 080516e2747..6b7b611908b 160000 --- a/src/main/resources/csl-styles +++ b/src/main/resources/csl-styles @@ -1 +1 @@ -Subproject commit 080516e27470d03c70bd3d5f6d712a0b61a45448 +Subproject commit 6b7b611908b20c91f34110d1c9489fb3278e0ef5 diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index b5c49b645ef..11762efeee1 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -162,7 +162,6 @@ void extractArXivFromPage1() { *Corresponding author filippo.ricca@unige.it (F. Ricca) https://person.dibris.unige.it/ricca-filippo/ (F. Ricca) - ORCID(s): 0000-0002-3928-5408 (F. Ricca); 0000-0002-6833-896X (A. Marchetto); 0000-0001-8956-3894 (A. 
Stocco)"""; assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n", Optional.empty())); From 28755cf07022a433f85701e0a5876617768c4648 Mon Sep 17 00:00:00 2001 From: Aryan Rana Date: Wed, 25 Dec 2024 22:53:17 +0530 Subject: [PATCH 04/18] removed csl-styles From 06b6bb05d2ce73e4639c8d9761b10670fcb61157 Mon Sep 17 00:00:00 2001 From: Aryan Rana Date: Sat, 28 Dec 2024 18:31:58 +0530 Subject: [PATCH 05/18] fixed null arxiv issue on external imports --- .../fileformat/PdfContentImporter.java | 21 ++++++++++++------- .../entry/identifier/ArXivIdentifier.java | 2 +- .../fileformat/PdfContentImporterTest.java | 10 ++++----- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index 9a22ec9a86d..925fa441394 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -370,11 +370,13 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS String publisher = null; EntryType type = StandardEntryType.InProceedings; + // sometimes ArXiv ID is read before all parameters + getArXivId(null); if (curString.length() > 4) { // special case: possibly conference as first line on the page extractYear(); doi = getDoi(null); - arXivId = getArXivId(null); + arXivId = getArXivId(arXivId); if (curString.contains("Conference")) { fillCurStringWithNonEmptyLines(); conference = curString; @@ -390,7 +392,6 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } } } - // start: title fillCurStringWithNonEmptyLines(); title = streamlineTitle(curString); @@ -535,6 +536,10 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } } + if (arXivId != null && arXivId.contains(year)) { + year = null; + } + BibEntry entry = new BibEntry(); 
entry.setType(type); @@ -576,7 +581,7 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS if (pages != null) { entry.setField(StandardField.PAGES, pages); } - if (year != null && !"0000".equals(year)) { + if (year != null) { entry.setField(StandardField.YEAR, year); } else if (arXivId != null) { year = "20" + arXivId.substring(0, 2); @@ -604,11 +609,13 @@ private String getDoi(String doi) { private String getArXivId(String arXivId) { if (arXivId == null) { - for (String line: lines) { - arXivId = ArXivIdentifier.parse(line).map(ArXivIdentifier::asString).orElse(null); - if (arXivId != null) { - return arXivId; + arXivId = ArXivIdentifier.parse(curString).map(ArXivIdentifier::asString).orElse(null); + if (arXivId != null) { + if (curString.length() > arXivId.length() + 7) { + curString = curString.substring(arXivId.length() + 7); + extractYear(); } + return arXivId; } } return arXivId; diff --git a/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java b/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java index 78f43ccb6a4..bf4fd939410 100644 --- a/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java +++ b/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java @@ -38,7 +38,7 @@ public class ArXivIdentifier extends EprintIdentifier { } public static Optional parse(String value) { - String identifier = value.replace(" ", ""); + String identifier = value.split(" ")[0]; Pattern identifierPattern = Pattern.compile("(" + ARXIV_PREFIX + ")?\\s?:?\\s?(?\\d{4}\\.\\d{4,5})(v(?\\d+))?\\s?(\\[(?\\S+)\\])?"); Matcher identifierMatcher = identifierPattern.matcher(identifier); if (identifierMatcher.matches()) { diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index 11762efeee1..b140de86287 100644 --- 
a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -132,16 +132,17 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 @Test void extractArXivFromPage1() { BibEntry entry = new BibEntry(StandardEntryType.TechReport) - .withField(StandardField.AUTHOR, "Filippo Ricca and Alessandro Marchetto and Andrea Stoccoc") - .withField(StandardField.TITLE, "A Multi-Year Grey Literature Review on AI-assisted Test Automation") + .withField(StandardField.AUTHOR, "Filippo Riccaa and Alessandro Marchettob and Andrea Stoccoc") + .withField(StandardField.TITLE, "[cs.SE] 12 Aug 2024 A Multi-Year Grey Literature Review on AI-assisted Test Automation") .withField(StandardField.YEAR, "2024") .withField(StandardField.EPRINT, "2408.06224v1") .withField((StandardField.KEYWORDS), "Test Automation Artificial Intelligence AI-assisted Test Automation Grey Literature Automated Test Generation Self-Healing Test Scripts"); String firstPageContent = """ + arXiv:2408.06224v1 [cs.SE] 12 Aug 2024 A Multi-Year Grey Literature Review on AI-assisted Test Automation - Filippo Ricca, Alessandro Marchetto and Andrea Stoccoc + Filippo Riccaa, Alessandro Marchettob and Andrea Stoccoc aUniversity of Genoa, Via Balbi 5, Genova, 16126, Italy bUniversity of Trento, Via Sommarive 9, Trento, 38123, Italy @@ -156,9 +157,6 @@ void extractArXivFromPage1() { Automated Test Generation Self-Healing Test Scripts - arXiv:2408.06224v1 - [cs.SE] 12 Aug 2024 - *Corresponding author filippo.ricca@unige.it (F. Ricca) https://person.dibris.unige.it/ricca-filippo/ (F. 
Ricca) From 457bb3f8dcc0c62b1f8979463e6deb4e5942ddf0 Mon Sep 17 00:00:00 2001 From: Aryan Rana Date: Wed, 1 Jan 2025 17:33:40 +0530 Subject: [PATCH 06/18] Improved getArxivId Implementation --- .../logic/importer/fileformat/PdfContentImporter.java | 10 +++++++--- .../jabref/model/entry/identifier/ArXivIdentifier.java | 2 +- .../importer/fileformat/PdfContentImporterTest.java | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index 925fa441394..21636d81cc5 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -370,8 +370,6 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS String publisher = null; EntryType type = StandardEntryType.InProceedings; - // sometimes ArXiv ID is read before all parameters - getArXivId(null); if (curString.length() > 4) { // special case: possibly conference as first line on the page extractYear(); @@ -392,6 +390,8 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } } } + // sometimes ArXiv ID is read before title + getArXivId(null); // start: title fillCurStringWithNonEmptyLines(); title = streamlineTitle(curString); @@ -609,11 +609,15 @@ private String getDoi(String doi) { private String getArXivId(String arXivId) { if (arXivId == null) { - arXivId = ArXivIdentifier.parse(curString).map(ArXivIdentifier::asString).orElse(null); + String arXiv = curString.split(" ")[0]; + arXivId = ArXivIdentifier.parse(arXiv).map(ArXivIdentifier::asString).orElse(null); if (arXivId != null) { if (curString.length() > arXivId.length() + 7) { + // The arxiv string also contains the year curString = curString.substring(arXivId.length() + 7); extractYear(); + curString = ""; + proceedToNextNonEmptyLine(); } return arXivId; } diff 
--git a/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java b/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java index bf4fd939410..78f43ccb6a4 100644 --- a/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java +++ b/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java @@ -38,7 +38,7 @@ public class ArXivIdentifier extends EprintIdentifier { } public static Optional parse(String value) { - String identifier = value.split(" ")[0]; + String identifier = value.replace(" ", ""); Pattern identifierPattern = Pattern.compile("(" + ARXIV_PREFIX + ")?\\s?:?\\s?(?\\d{4}\\.\\d{4,5})(v(?\\d+))?\\s?(\\[(?\\S+)\\])?"); Matcher identifierMatcher = identifierPattern.matcher(identifier); if (identifierMatcher.matches()) { diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index b140de86287..5f44923d404 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -133,7 +133,7 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 void extractArXivFromPage1() { BibEntry entry = new BibEntry(StandardEntryType.TechReport) .withField(StandardField.AUTHOR, "Filippo Riccaa and Alessandro Marchettob and Andrea Stoccoc") - .withField(StandardField.TITLE, "[cs.SE] 12 Aug 2024 A Multi-Year Grey Literature Review on AI-assisted Test Automation") + .withField(StandardField.TITLE, "A Multi-Year Grey Literature Review on AI-assisted Test Automation") .withField(StandardField.YEAR, "2024") .withField(StandardField.EPRINT, "2408.06224v1") .withField((StandardField.KEYWORDS), "Test Automation Artificial Intelligence AI-assisted Test Automation Grey Literature Automated Test Generation Self-Healing Test Scripts"); From a653de52564149594b81d43045c0b22c578018a9 Mon Sep 17 
00:00:00 2001 From: Aryan Rana Date: Mon, 6 Jan 2025 23:20:46 +0530 Subject: [PATCH 07/18] reduced nesting and added arxiv constant --- .../fileformat/PdfContentImporter.java | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index 21636d81cc5..8e64a5fb8c5 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -372,9 +372,9 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS EntryType type = StandardEntryType.InProceedings; if (curString.length() > 4) { // special case: possibly conference as first line on the page + arXivId = getArXivId(null); extractYear(); doi = getDoi(null); - arXivId = getArXivId(arXivId); if (curString.contains("Conference")) { fillCurStringWithNonEmptyLines(); conference = curString; @@ -391,7 +391,7 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } } // sometimes ArXiv ID is read before title - getArXivId(null); + arXivId = getArXivId(arXivId); // start: title fillCurStringWithNonEmptyLines(); title = streamlineTitle(curString); @@ -608,18 +608,16 @@ private String getDoi(String doi) { } private String getArXivId(String arXivId) { + final int ARXIV_PREFIX_LENGTH = "arxiv:".length(); if (arXivId == null) { String arXiv = curString.split(" ")[0]; arXivId = ArXivIdentifier.parse(arXiv).map(ArXivIdentifier::asString).orElse(null); - if (arXivId != null) { - if (curString.length() > arXivId.length() + 7) { - // The arxiv string also contains the year - curString = curString.substring(arXivId.length() + 7); - extractYear(); - curString = ""; - proceedToNextNonEmptyLine(); - } - return arXivId; + if (arXivId != null && curString.length() > arXivId.length() + ARXIV_PREFIX_LENGTH) { + // The arxiv 
string also contains the year + curString = curString.substring(arXivId.length() + ARXIV_PREFIX_LENGTH); + extractYear(); + curString = ""; + proceedToNextNonEmptyLine(); } } return arXivId; From 06c771b855fcb9bc5bdfc6d26ebf74e512555f49 Mon Sep 17 00:00:00 2001 From: Aryan Rana Date: Tue, 7 Jan 2025 19:47:06 +0530 Subject: [PATCH 08/18] reduced nesting --- .../fileformat/PdfContentImporter.java | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index 8e64a5fb8c5..ca73b99b8ee 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -52,6 +52,8 @@ public class PdfContentImporter extends PdfImporter { private static final Pattern YEAR_EXTRACT_PATTERN = Pattern.compile("\\d{4}"); + private static final int ARXIV_PREFIX_LENGTH = "arxiv:".length(); + // input lines into several lines private String[] lines; @@ -608,18 +610,22 @@ private String getDoi(String doi) { } private String getArXivId(String arXivId) { - final int ARXIV_PREFIX_LENGTH = "arxiv:".length(); - if (arXivId == null) { - String arXiv = curString.split(" ")[0]; - arXivId = ArXivIdentifier.parse(arXiv).map(ArXivIdentifier::asString).orElse(null); - if (arXivId != null && curString.length() > arXivId.length() + ARXIV_PREFIX_LENGTH) { - // The arxiv string also contains the year - curString = curString.substring(arXivId.length() + ARXIV_PREFIX_LENGTH); - extractYear(); - curString = ""; - proceedToNextNonEmptyLine(); - } + if (arXivId != null) { + return arXivId; } + + String arXiv = curString.split(" ")[0]; + arXivId = ArXivIdentifier.parse(arXiv).map(ArXivIdentifier::asString).orElse(null); + + if (arXivId == null || curString.length() < arXivId.length() + ARXIV_PREFIX_LENGTH) { + return arXivId; + } + // 
The arxiv string also contains the year + curString = curString.substring(arXivId.length() + ARXIV_PREFIX_LENGTH); + extractYear(); + curString = ""; + proceedToNextNonEmptyLine(); + return arXivId; } From 1512d7eb456f000b82e0d778389dbed7057c403f Mon Sep 17 00:00:00 2001 From: Aryan Rana Date: Wed, 8 Jan 2025 17:52:28 +0530 Subject: [PATCH 09/18] modified testcase --- .../jabref/logic/importer/fileformat/PdfContentImporter.java | 4 ++-- .../logic/importer/fileformat/PdfContentImporterTest.java | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index ca73b99b8ee..168491723d8 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -373,8 +373,8 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS EntryType type = StandardEntryType.InProceedings; if (curString.length() > 4) { - // special case: possibly conference as first line on the page arXivId = getArXivId(null); + // special case: possibly conference as first line on the page extractYear(); doi = getDoi(null); if (curString.contains("Conference")) { @@ -392,7 +392,7 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } } } - // sometimes ArXiv ID is read before title + arXivId = getArXivId(arXivId); // start: title fillCurStringWithNonEmptyLines(); diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index 5f44923d404..c5e13a9122f 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -139,7 +139,6 @@ void extractArXivFromPage1() { 
.withField((StandardField.KEYWORDS), "Test Automation Artificial Intelligence AI-assisted Test Automation Grey Literature Automated Test Generation Self-Healing Test Scripts"); String firstPageContent = """ - arXiv:2408.06224v1 [cs.SE] 12 Aug 2024 A Multi-Year Grey Literature Review on AI-assisted Test Automation Filippo Riccaa, Alessandro Marchettob and Andrea Stoccoc @@ -160,7 +159,9 @@ void extractArXivFromPage1() { *Corresponding author filippo.ricca@unige.it (F. Ricca) https://person.dibris.unige.it/ricca-filippo/ (F. Ricca) - ORCID(s): 0000-0002-3928-5408 (F. Ricca); 0000-0002-6833-896X (A. Marchetto); 0000-0001-8956-3894 (A. Stocco)"""; + ORCID(s): 0000-0002-3928-5408 (F. Ricca); 0000-0002-6833-896X (A. Marchetto); 0000-0001-8956-3894 (A. Stocco) + + arXiv:2408.06224v1 [cs.SE] 12 Aug 2024"""; assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n", Optional.empty())); } From a21f893e771e13e0b2be15fbf61a0c405b92d591 Mon Sep 17 00:00:00 2001 From: Siedlerchr Date: Thu, 16 Jan 2025 20:06:48 +0100 Subject: [PATCH 10/18] fix abbrev repo --- buildres/abbrv.jabref.org | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildres/abbrv.jabref.org b/buildres/abbrv.jabref.org index 78e1b08f044..e74e6eb800b 160000 --- a/buildres/abbrv.jabref.org +++ b/buildres/abbrv.jabref.org @@ -1 +1 @@ -Subproject commit 78e1b08f04405c376ae65488a1b268ee938750ce +Subproject commit e74e6eb800b10aa7e4afe9a2a38b359653168c34 From 774ee8410b1f1742548c6781e03644bd2ba2ac07 Mon Sep 17 00:00:00 2001 From: Aryan Rana Date: Fri, 17 Jan 2025 18:45:59 +0530 Subject: [PATCH 11/18] removed unnecessary 'if' clause --- .../logic/importer/fileformat/PdfContentImporter.java | 7 ------- .../logic/importer/fileformat/PdfContentImporterTest.java | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java 
index a9986c9b07f..bbb905742c7 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -546,10 +546,6 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } } - if (arXivId != null && arXivId.contains(year)) { - year = null; - } - BibEntry entry = new BibEntry(); entry.setType(type); @@ -593,9 +589,6 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } if (year != null) { entry.setField(StandardField.YEAR, year); - } else if (arXivId != null) { - year = "20" + arXivId.substring(0, 2); - entry.setField(StandardField.YEAR, year); } if (publisher != null) { entry.setField(StandardField.PUBLISHER, publisher); diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index b9419987da8..76d924a23dd 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -134,7 +134,7 @@ void extractArXivFromPage1() { BibEntry entry = new BibEntry(StandardEntryType.TechReport) .withField(StandardField.AUTHOR, "Filippo Riccaa and Alessandro Marchettob and Andrea Stoccoc") .withField(StandardField.TITLE, "A Multi-Year Grey Literature Review on AI-assisted Test Automation") - .withField(StandardField.YEAR, "2024") + .withField(StandardField.YEAR, "2408") .withField(StandardField.EPRINT, "2408.06224v1") .withField((StandardField.KEYWORDS), "Test Automation Artificial Intelligence AI-assisted Test Automation Grey Literature Automated Test Generation Self-Healing Test Scripts"); From 126cbaae04ae3d2b3566867fe9f365f885d5b109 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Mon, 20 Jan 2025 21:02:41 +0100 Subject: [PATCH 12/18] Use EPRINTTYPE Co-authored-by: Carl Christian Snethlage 
<50491877+calixtus@users.noreply.github.com> Co-authored-by: Christoph Co-authored-by: Ruslan Co-authored-by: Subhramit Basu Bhowmick <74734844+subhramit@users.noreply.github.com> --- .../jabref/logic/importer/fileformat/PdfContentImporter.java | 2 ++ src/main/java/org/jabref/model/entry/field/StandardField.java | 3 +++ 2 files changed, 5 insertions(+) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index bbb905742c7..8521500bec3 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -574,6 +574,8 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } if (arXivId != null) { entry.setField(StandardField.EPRINT, arXivId); + assert !arXivId.startsWith("arxiv"); + entry.setField(StandardField.EPRINTTYPE, "arxiv"); } if (series != null) { entry.setField(StandardField.SERIES, series); diff --git a/src/main/java/org/jabref/model/entry/field/StandardField.java b/src/main/java/org/jabref/model/entry/field/StandardField.java index 842883ff682..8605be9f85b 100644 --- a/src/main/java/org/jabref/model/entry/field/StandardField.java +++ b/src/main/java/org/jabref/model/entry/field/StandardField.java @@ -51,9 +51,12 @@ public enum StandardField implements Field { EDITORCTYPE("editorctype", FieldProperty.EDITOR_TYPE), EID("eid"), ENTRYSET("entryset", FieldProperty.MULTIPLE_ENTRY_LINK), + + // For the syntax of a "combined" field, see {@link org.jabref.logic.cleanup.EprintCleanupTest.cleanupCompleteEntry} for examples EPRINT("eprint", FieldProperty.VERBATIM, FieldProperty.IDENTIFIER), EPRINTCLASS("eprintclass"), EPRINTTYPE("eprinttype"), + EVENTDATE("eventdate", FieldProperty.DATE), EVENTTITLE("eventtitle"), EVENTTITLEADDON("eventtitleaddon"), From 03fede47986815a903f4711848e6ac111308e9e2 Mon Sep 17 00:00:00 2001 From: Oliver Kopp 
Date: Mon, 20 Jan 2025 21:04:56 +0100 Subject: [PATCH 13/18] Update src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java Co-authored-by: Ruslan --- .../jabref/logic/importer/fileformat/PdfContentImporter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index 8521500bec3..700245cf038 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -583,7 +583,7 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS if (volume != null) { entry.setField(StandardField.VOLUME, volume); } - if (number != null && number.chars().allMatch(Character::isDigit)) { + if (number != null) { entry.setField(StandardField.NUMBER, number); } if (pages != null) { From cfe1664601a8342ec9213e84189d085a9ed7677c Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Mon, 20 Jan 2025 21:22:03 +0100 Subject: [PATCH 14/18] WIP Co-authored-by: Carl Christian Snethlage <50491877+calixtus@users.noreply.github.com> Co-authored-by: Christoph Co-authored-by: Ruslan Co-authored-by: Subhramit Basu Bhowmick <74734844+subhramit@users.noreply.github.com> --- .../logic/importer/fileformat/PdfContentImporter.java | 8 ++------ .../logic/importer/fileformat/PdfContentImporterTest.java | 2 ++ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index 700245cf038..4f5ee9e581a 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -546,8 +546,7 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS } } - 
BibEntry entry = new BibEntry(); - entry.setType(type); + BibEntry entry = new BibEntry(type); // TODO: institution parsing missing @@ -623,10 +622,7 @@ private String getArXivId(String arXivId) { if (arXivId == null || curString.length() < arXivId.length() + ARXIV_PREFIX_LENGTH) { return arXivId; } - // The arxiv string also contains the year - curString = curString.substring(arXivId.length() + ARXIV_PREFIX_LENGTH); - extractYear(); - curString = ""; + proceedToNextNonEmptyLine(); return arXivId; diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index 76d924a23dd..94618ffd11e 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -136,8 +136,10 @@ void extractArXivFromPage1() { .withField(StandardField.TITLE, "A Multi-Year Grey Literature Review on AI-assisted Test Automation") .withField(StandardField.YEAR, "2408") .withField(StandardField.EPRINT, "2408.06224v1") + .withField(StandardField.EPRINTTYPE, "arXiv") .withField((StandardField.KEYWORDS), "Test Automation Artificial Intelligence AI-assisted Test Automation Grey Literature Automated Test Generation Self-Healing Test Scripts"); + // This is from https://arxiv.org/abs/2408.06224 String firstPageContent = """ A Multi-Year Grey Literature Review on AI-assisted Test Automation From b3d85561609615da9f26a8b71f3f799c7d913090 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Mon, 20 Jan 2025 21:31:58 +0100 Subject: [PATCH 15/18] Add completion using arXiv ID Co-authored-by: Carl Christian Snethlage <50491877+calixtus@users.noreply.github.com> Co-authored-by: Christoph Co-authored-by: Ruslan Co-authored-by: Subhramit Basu Bhowmick <74734844+subhramit@users.noreply.github.com> --- .../importer/fileformat/PdfMergeMetadataImporter.java | 8 ++++++++ 1 file changed, 8 
insertions(+) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java index bb461423a76..992b3c9fcdb 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java @@ -15,6 +15,7 @@ import org.jabref.logic.importer.FetcherException; import org.jabref.logic.importer.ImportFormatPreferences; import org.jabref.logic.importer.ParserResult; +import org.jabref.logic.importer.fetcher.ArXivFetcher; import org.jabref.logic.importer.fetcher.DoiFetcher; import org.jabref.logic.importer.fetcher.isbntobibtex.IsbnFetcher; import org.jabref.logic.importer.util.FileFieldParser; @@ -108,6 +109,13 @@ public ParserResult importDatabase(Path filePath) throws IOException { LOGGER.error("Fetching failed for ISBN \"{}\".", candidate.getField(StandardField.ISBN).get(), e); } } + if (candidate.hasField(StandardField.EPRINT)) { + try { + new ArXivFetcher(importFormatPreferences).performSearchById(candidate.getField(StandardField.EPRINT).get()).ifPresent(fetchedCandidates::add); + } catch (FetcherException e) { + LOGGER.error("Fetching failed for arXiv ID \"{}\".", candidate.getField(StandardField.EPRINT).get(), e); + } + } } candidates.addAll(0, fetchedCandidates); BibEntry entry = new BibEntry(); From 18673c8b826ee40ca8ef89ad17d0941a4718e168 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Mon, 20 Jan 2025 21:39:51 +0100 Subject: [PATCH 16/18] Adapt test case Co-authored-by: Carl Christian Snethlage <50491877+calixtus@users.noreply.github.com> Co-authored-by: Christoph Co-authored-by: Ruslan Co-authored-by: Subhramit Basu Bhowmick <74734844+subhramit@users.noreply.github.com> --- .../jabref/logic/importer/fileformat/PdfContentImporterTest.java | 1 - 1 file changed, 1 deletion(-) diff --git 
a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index 94618ffd11e..5be6c432f78 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -134,7 +134,6 @@ void extractArXivFromPage1() { BibEntry entry = new BibEntry(StandardEntryType.TechReport) .withField(StandardField.AUTHOR, "Filippo Riccaa and Alessandro Marchettob and Andrea Stoccoc") .withField(StandardField.TITLE, "A Multi-Year Grey Literature Review on AI-assisted Test Automation") - .withField(StandardField.YEAR, "2408") .withField(StandardField.EPRINT, "2408.06224v1") .withField(StandardField.EPRINTTYPE, "arXiv") .withField((StandardField.KEYWORDS), "Test Automation Artificial Intelligence AI-assisted Test Automation Grey Literature Automated Test Generation Self-Healing Test Scripts"); From 1f112a2012d8435a300c4f22e438ae3ace5419f2 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Mon, 20 Jan 2025 22:56:10 +0100 Subject: [PATCH 17/18] Fix test name --- .../logic/importer/fileformat/PdfContentImporterTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index 5be6c432f78..ea98e5539f0 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -130,7 +130,7 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 } @Test - void extractArXivFromPage1() { + void extractArXivFromPage() { BibEntry entry = new BibEntry(StandardEntryType.TechReport) .withField(StandardField.AUTHOR, "Filippo Riccaa and Alessandro Marchettob and Andrea Stoccoc") 
.withField(StandardField.TITLE, "A Multi-Year Grey Literature Review on AI-assisted Test Automation") From 5e961fab2a7c3a1dac8faeb02f1689ea44548819 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Mon, 20 Jan 2025 23:06:51 +0100 Subject: [PATCH 18/18] "Fix" number and year extraction for arXiv --- .../logic/importer/fileformat/PdfContentImporter.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index 4f5ee9e581a..974f4403a9d 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -574,7 +574,11 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS if (arXivId != null) { entry.setField(StandardField.EPRINT, arXivId); assert !arXivId.startsWith("arxiv"); - entry.setField(StandardField.EPRINTTYPE, "arxiv"); + entry.setField(StandardField.EPRINTTYPE, "arXiv"); + + // Quick workaround to avoid wrong year and number parsing + number = null; // "Germany" in org.jabref.logic.importer.fileformat.PdfContentImporterTest.extractArXivFromPage + year = null; // "2408" in org.jabref.logic.importer.fileformat.PdfContentImporterTest.extractArXivFromPage } if (series != null) { entry.setField(StandardField.SERIES, series);