diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f4e08081bf..b32d30f3c0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv - ⚠️ We relaxed the escaping requirements for [bracketed patterns](https://docs.jabref.org/setup/citationkeypatterns), which are used for the [citaton key generator](https://docs.jabref.org/advanced/entryeditor#autogenerate-citation-key) and [filename and directory patterns](https://docs.jabref.org/finding-sorting-and-cleaning-entries/filelinks#auto-linking-files). One only needs to write `\"` if a quote sign should be escaped. All other escapings are not necessary (and working) any more. [#11967](https://github.com/JabRef/jabref/pull/11967) - When importing BibTeX data starging from on a PDF, the XMP metadata takes precedence over Grobid data. [#11992](https://github.com/JabRef/jabref/pull/11992) - JabRef now uses TLS 1.2 for all HTTPS connections. [#11852](https://github.com/JabRef/jabref/pull/11852) +- We improved the functionality of getting BibTeX data out of PDF files. [#11999](https://github.com/JabRef/jabref/issues/11999) - We improved the display of long messages in the integrity check dialog. [#11619](https://github.com/JabRef/jabref/pull/11619) - We improved the undo/redo buttons in the main toolbar and main menu to be disabled when there is nothing to undo/redo. [#8807](https://github.com/JabRef/jabref/issues/8807) - We improved the DOI detection in PDF imports. 
[#11782](https://github.com/JabRef/jabref/pull/11782) diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java index ef4c25321e5..c16c0616948 100644 --- a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java +++ b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java @@ -26,9 +26,13 @@ import org.jabref.model.entry.types.StandardEntryType; import org.jabref.model.strings.StringUtil; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Strings; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.TextPosition; + +import static org.jabref.model.strings.StringUtil.isNullOrEmpty; /** * PdfContentImporter parses data of the first page of the PDF and creates a BibTeX entry. @@ -196,7 +200,8 @@ public ParserResult importDatabase(Path filePath) { List result = new ArrayList<>(1); try (PDDocument document = new XmpUtilReader().loadWithAutomaticDecryption(filePath)) { String firstPageContents = getFirstPageContents(document); - Optional entry = getEntryFromPDFContent(firstPageContents, OS.NEWLINE); + String titleByFontSize = extractTitleFromDocument(document); + Optional entry = getEntryFromPDFContent(firstPageContents, OS.NEWLINE, titleByFontSize); entry.ifPresent(result::add); } catch (EncryptedPdfsNotSupportedException e) { return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported.")); @@ -208,17 +213,120 @@ public ParserResult importDatabase(Path filePath) { return new ParserResult(result); } - // make this method package visible so we can test it - Optional getEntryFromPDFContent(String firstpageContents, String lineSeparator) { - // idea: split[] contains the different lines - // blocks are separated by empty lines - // treat each block - // or do special treatment at authors (which are not 
broken) - // therefore, we do a line-based and not a block-based splitting - // i points to the current line - // curString (mostly) contains the current block - // the different lines are joined into one and thereby separated by " " + private static String extractTitleFromDocument(PDDocument document) throws IOException { + TitleExtractorByFontSize stripper = new TitleExtractorByFontSize(); + return stripper.getTitleFromFirstPage(document); + } + + private static class TitleExtractorByFontSize extends PDFTextStripper { + + private final List textPositionsList; + + public TitleExtractorByFontSize() { + super(); + this.textPositionsList = new ArrayList<>(); + } + + public String getTitleFromFirstPage(PDDocument document) throws IOException { + this.setStartPage(1); + this.setEndPage(1); + this.writeText(document, new StringWriter()); + return findLargestFontText(textPositionsList); + } + + @Override + protected void writeString(String text, List textPositions) { + textPositionsList.addAll(textPositions); + } + + private boolean isFarAway(TextPosition previous, TextPosition current) { + float XspaceThreshold = 3.0F; + float YspaceThreshold = previous.getFontSizeInPt() * 1.5F; + float Xgap = current.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()); + float Ygap = current.getYDirAdj() - (previous.getYDirAdj() - previous.getHeightDir()); + return Xgap > XspaceThreshold && Ygap > YspaceThreshold; + } + + private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition textPosition) { + if (textPosition == null || previousTextPosition == null) { + return false; + } + // The title is usually not in the bottom 10% of a page. + if ((textPosition.getPageHeight() - textPosition.getYDirAdj()) + < (textPosition.getPageHeight() * 0.1)) { + return true; + } + // The characters of a title usually stay close together.
+ return isFarAway(previousTextPosition, textPosition); + } + + private String findLargestFontText(List textPositions) { + float maxFontSize = 0; + StringBuilder largestFontText = new StringBuilder(); + TextPosition previousTextPosition = null; + for (TextPosition textPosition : textPositions) { + // Exclude unwanted text based on heuristics + if (isUnwantedText(previousTextPosition, textPosition)) { + continue; + } + float fontSize = textPosition.getFontSizeInPt(); + if (fontSize > maxFontSize) { + maxFontSize = fontSize; + largestFontText.setLength(0); + largestFontText.append(textPosition.getUnicode()); + previousTextPosition = textPosition; + } else if (fontSize == maxFontSize) { + if (previousTextPosition != null) { + if (isThereSpace(previousTextPosition, textPosition)) { + largestFontText.append(" "); + } + } + largestFontText.append(textPosition.getUnicode()); + previousTextPosition = textPosition; + } + } + return largestFontText.toString().trim(); + } + + private boolean isThereSpace(TextPosition previous, TextPosition current) { + float XspaceThreshold = 0.5F; + float YspaceThreshold = previous.getFontSizeInPt(); + float Xgap = current.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()); + float Ygap = current.getYDirAdj() - (previous.getYDirAdj() - previous.getHeightDir()); + return Xgap > XspaceThreshold || Ygap > YspaceThreshold; + } + } + /** + * Parses the first page content of a PDF document and extracts bibliographic information such as title, author, + * abstract, keywords, and other relevant metadata. This method processes the content line-by-line and uses + * custom parsing logic to identify and assemble information blocks from academic papers. + * + * idea: split[] contains the different lines, blocks are separated by empty lines, treat each block + * or do special treatment at authors (which are not broken). 
+ * Therefore, we do a line-based and not a block-based splitting: i points to the current line, + * curString (mostly) contains the current block, + * and the different lines of a block are joined into one, separated by " ". + * + *

This method follows the structure typically found in academic paper PDFs: + * - First, it attempts to detect the title by font size, if available, or by text position. + * - Authors are then processed line-by-line until reaching the next section. + * - Abstract and keywords, if found, are extracted as they appear on the page. + * - Finally, conference details, DOI, and publication information are parsed from the lower blocks. + * + *

The parsing logic also identifies and categorizes entries based on keywords such as "Abstract" or "Keywords" + * and specific terms that denote sections. Additionally, this method can handle + * publisher-specific formats like Springer or IEEE, extracting data like series, volume, and conference titles. + * + * @param firstpageContents The raw content of the PDF's first page, which may contain metadata and main content. + * @param lineSeparator The line separator used to format and unify line breaks in the text content. + * @param titleByFontSize An optional title string determined by font size; if provided, this overrides the + * default title parsing. + * @return An {@link Optional} containing a {@link BibEntry} with the parsed bibliographic data if extraction + * is successful. Otherwise, an empty {@link Optional}. + */ + @VisibleForTesting + Optional getEntryFromPDFContent(String firstpageContents, String lineSeparator, String titleByFontSize) { String firstpageContentsUnifiedLineBreaks = StringUtil.unifyLineBreaks(firstpageContents, lineSeparator); lines = firstpageContentsUnifiedLineBreaks.split(lineSeparator); @@ -275,8 +383,11 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS // start: title fillCurStringWithNonEmptyLines(); title = streamlineTitle(curString); - curString = ""; // i points to the next non-empty line + curString = ""; + if (!isNullOrEmpty(titleByFontSize)) { + title = titleByFontSize; + } // after title: authors author = null; @@ -393,13 +504,6 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS // IEEE has the conference things at the end publisher = "IEEE"; - // year is extracted by extractYear - // otherwise, we could it determine as follows: - // String yearStr = curString.substring(curString.length()-4); - // if (isYear(yearStr)) { - // year = yearStr; - // } - if (conference == null) { pos = curString.indexOf('$'); if (pos > 0) { diff --git 
a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java index 62443d3accf..5dbd04e72e7 100644 --- a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java +++ b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java @@ -2,7 +2,9 @@ import java.nio.file.Path; import java.util.List; +import java.util.Objects; import java.util.Optional; +import java.util.stream.Stream; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.LinkedFile; @@ -10,12 +12,15 @@ import org.jabref.model.entry.types.StandardEntryType; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import static org.junit.jupiter.api.Assertions.assertEquals; class PdfContentImporterTest { - private PdfContentImporter importer = new PdfContentImporter(); + private final PdfContentImporter importer = new PdfContentImporter(); @Test void doesNotHandleEncryptedPdfs() throws Exception { @@ -65,7 +70,7 @@ void parsingEditorWithoutPagesorSeriesInformation() { Corpus linguistics investigates human language by starting out from large """; - assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContents, "\n")); + assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContents, "\n", "")); } @Test @@ -88,7 +93,7 @@ Smith, Lucy Anna (2014) Mortality in the Ornamental Fish Retail Sector: an Analy UNSPECIFIED Master of Research (MRes) thesis, University of Kent,."""; - assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContents, "\n")); + assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContents, "\n", "")); } @Test @@ -121,6 +126,29 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 British Journal 
of Nutrition https://doi.org/10.1017/S0007114507795296 Published online by Cambridge University Press"""; - assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n")); + assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n", "")); + } + + @ParameterizedTest + @MethodSource("providePdfData") + void pdfTitleExtraction(String expectedTitle, String filePath) throws Exception { + Path file = Path.of(Objects.requireNonNull(PdfContentImporter.class.getResource(filePath)).toURI()); + List result = importer.importDatabase(file).getDatabase().getEntries(); + assertEquals(Optional.of(expectedTitle), result.getFirst().getTitle()); + } + + private static Stream providePdfData() { + return Stream.of( + Arguments.of("On How We Can Teach – Exploring New Ways in Professional Software Development for Students", "/pdfs/PdfContentImporter/Kriha2018.pdf"), + Arguments.of("JabRef Example for Reference Parsing", "/pdfs/IEEE/ieee-paper.pdf"), + Arguments.of("Paper Title", "/org/jabref/logic/importer/util/LNCS-minimal.pdf"), + Arguments.of("Is Oil the future?", "/pdfs/example-scientificThesisTemplate.pdf"), + Arguments.of("Thesis Title", "/pdfs/thesis-example.pdf"), + Arguments.of("Recovering Trace Links Between Software Documentation And Code", "/pdfs/PdfContentImporter/Keim2024.pdf"), + Arguments.of("On the impact of service-oriented patterns on software evolvability: a controlled experiment and metric-based analysis", "/pdfs/PdfContentImporter/Bogner2019.pdf"), + Arguments.of("Pandemic programming", "/pdfs/PdfContentImporter/Ralph2020.pdf"), + Arguments.of("Do RESTful API design rules have an impact on the understandability of Web APIs?", "/pdfs/PdfContentImporter/Bogner2023.pdf"), + Arguments.of("Adopting microservices and DevOps in the cyber-physical systems domain: A rapid review and case study", "/pdfs/PdfContentImporter/Fritzsch2022.pdf") + ); } } diff --git a/src/test/resources/pdfs/PdfContentImporter/Bogner2019.pdf 
b/src/test/resources/pdfs/PdfContentImporter/Bogner2019.pdf new file mode 100644 index 00000000000..3e1b425cdf5 Binary files /dev/null and b/src/test/resources/pdfs/PdfContentImporter/Bogner2019.pdf differ diff --git a/src/test/resources/pdfs/PdfContentImporter/Bogner2023.pdf b/src/test/resources/pdfs/PdfContentImporter/Bogner2023.pdf new file mode 100644 index 00000000000..2f4a73b27cf Binary files /dev/null and b/src/test/resources/pdfs/PdfContentImporter/Bogner2023.pdf differ diff --git a/src/test/resources/pdfs/PdfContentImporter/Fritzsch2022.pdf b/src/test/resources/pdfs/PdfContentImporter/Fritzsch2022.pdf new file mode 100644 index 00000000000..59af4d2a81d Binary files /dev/null and b/src/test/resources/pdfs/PdfContentImporter/Fritzsch2022.pdf differ diff --git a/src/test/resources/pdfs/PdfContentImporter/Keim2024.pdf b/src/test/resources/pdfs/PdfContentImporter/Keim2024.pdf new file mode 100644 index 00000000000..a6564a33de7 Binary files /dev/null and b/src/test/resources/pdfs/PdfContentImporter/Keim2024.pdf differ diff --git a/src/test/resources/pdfs/PdfContentImporter/Kriha2018.pdf b/src/test/resources/pdfs/PdfContentImporter/Kriha2018.pdf new file mode 100644 index 00000000000..22fce5db219 Binary files /dev/null and b/src/test/resources/pdfs/PdfContentImporter/Kriha2018.pdf differ diff --git a/src/test/resources/pdfs/PdfContentImporter/Ralph2020.pdf b/src/test/resources/pdfs/PdfContentImporter/Ralph2020.pdf new file mode 100644 index 00000000000..62ac3d33da6 Binary files /dev/null and b/src/test/resources/pdfs/PdfContentImporter/Ralph2020.pdf differ diff --git a/src/test/resources/pdfs/PdfContentImporter/pdfContentImporterTest-pdfs.bib b/src/test/resources/pdfs/PdfContentImporter/pdfContentImporterTest-pdfs.bib new file mode 100644 index 00000000000..d17aadbaef1 --- /dev/null +++ b/src/test/resources/pdfs/PdfContentImporter/pdfContentImporterTest-pdfs.bib @@ -0,0 +1,128 @@ +@inproceedings{Kriha2018Teaching, +author = {Walter Kriha and Tobias Jordine} 
+abstract = {Requirements and approaches for introductory + courses in software development at universities differ + considerably. There seems to be little consensus on which + languages are a good fit, which methodologies lead to the best + results and especially which goals should be chosen. This paper + takes a look at current approaches and difficulties at our own + faculty – computer science and media at the Stuttgart Media + University – and explores a combination of teaching techniques + which seem to make a difference. The most important change was + to switch to a project-based approach instead of the usual exercises + given to students after a lecture. The second one is the flipped + classroom approach with micro-exams at the beginning of + lectures. The third one is an emphasis on professional tools to be + used during the project. We also try to achieve a concept-based + approach using e.g. modelling techniques to get a better + understanding of source code control and build. And finally, we + work as a team of two lecturers which allows us time to reflect on + how we do things and creates new ideas frequently. None of those + approaches is without problems as we will show, and we have met + with some critique in our own faculty. The paper is explorative, + based mostly on observations and feedback from students, but we + intend to get some quantitative results as well in later publications.} +title = {On How We Can Teach – Exploring New Ways in Professional Software Development for Students} +url = {https://kriha.de/dload/se2paper.pdf} +year = {2018} +file = {Kriha2018.pdf} +doi = {} +} + +@inproceedings{Bogner2023RESTAPI, +author = {Justus Bogner and Sebastian Kotstein and Timo Pfaff} +abstract = {Context + Web APIs are one of the most used ways to expose application functionality on the Web, and their understandability is important for efficiently using the provided resources. 
While many API design rules exist, empirical evidence for the effectiveness of most rules is lacking. + + Objective + We therefore wanted to study 1) the impact of RESTful API design rules on understandability, 2) if rule violations are also perceived as more difficult to understand, and 3) if demographic attributes like REST-related experience have an influence on this. + + Method + We conducted a controlled Web-based experiment with 105 participants, from both industry and academia and with different levels of experience. Based on a hybrid between a crossover and a between-subjects design, we studied 12 design rules using API snippets in two complementary versions: one that adhered to a rule and one that was a violation of this rule. Participants answered comprehension questions and rated the perceived difficulty. + + Results + For 11 of the 12 rules, we found that violation performed significantly worse than rule for the comprehension tasks. Regarding the subjective ratings, we found significant differences for 9 of the 12 rules, meaning that most violations were subjectively rated as more difficult to understand. Demographics played no role in the comprehension performance for violation. 
+ + Conclusions + Our results provide first empirical evidence for the importance of following design rules to improve the understandability of Web APIs, which is important for researchers, practitioners, and educators.} +title = {Do RESTful API design rules have an impact on the understandability of Web APIs?} +url = {https://link.springer.com/article/10.1007/s10664-023-10367-y} +year = {2023} +file = {Bogner2023.pdf} +doi = {10.1007/s10664-023-10367-y} +} + +@inproceedings{Ralph2020Pandemic, +author = {Paul Ralph and Sebastian Baltes and Gianisa Adisaputri and Richard Torkar and + Vladimir Kovalenko and Marcos Kalinowski and Nicole Novielli and Shin Yoo and + Xavier Devroey and Xin Tan and Minghui Zhou and Burak Turhan and Rashina Hoda and + Hideaki Hata and Gregorio Robles and Amin Milani Fard and Rana Alkadhi} +abstract = {Context + As a novel coronavirus swept the world in early 2020, thousands of software developers began working from home. Many did so on short notice, under difficult and stressful conditions. + + Objective + This study investigates the effects of the pandemic on developers’ wellbeing and productivity. + + Method + A questionnaire survey was created mainly from existing, validated scales and translated into 12 languages. The data was analyzed using non-parametric inferential statistics and structural equation modeling. + + Results + The questionnaire received 2225 usable responses from 53 countries. Factor analysis supported the validity of the scales and the structural model achieved a good fit (CFI = 0.961, RMSEA = 0.051, SRMR = 0.067). Confirmatory results include: (1) the pandemic has had a negative effect on developers’ wellbeing and productivity; (2) productivity and wellbeing are closely related; (3) disaster preparedness, fear related to the pandemic and home office ergonomics all affect wellbeing or productivity.
Exploratory analysis suggests that: (1) women, parents and people with disabilities may be disproportionately affected; (2) different people need different kinds of support. + + Conclusions + To improve employee productivity, software companies should focus on maximizing employee wellbeing and improving the ergonomics of employees’ home offices. Women, parents and disabled persons may require extra support.} +title = {Pandemic programming} +url = {https://link.springer.com/article/10.1007/s10664-020-09875-y} +year = {2020} +file = {Ralph2020.pdf} +doi = {10.1007/s10664-020-09875-y} +} + +@inproceedings{Fritzsch2022CyberPhysical, +author = {Jonas Fritzsch and Justus Bogner and Markus Haug and Ana Cristina Franco da Silva and Carolin Rubner and Matthias Saft and Horst Sauer and Stefan Wagner} +abstract = {The domain of cyber-physical systems (CPS) has recently seen strong growth, for example, due to the rise of the Internet of Things (IoT) in industrial domains, commonly referred to as “Industry 4.0.” However, CPS challenges like the strong hardware focus can impact modern software development practices, especially in the context of modernizing legacy systems. While microservices and DevOps have been widely studied for enterprise applications, there is insufficient coverage for the CPS domain. Our goal is therefore to analyze the peculiarities of such systems regarding challenges and practices for using and migrating towards microservices and DevOps. We conducted a rapid review based on 146 scientific papers, and subsequently validated our findings in an interview-based case study with nine CPS professionals in different business units at Siemens AG. The combined results picture the specifics of microservices and DevOps in the CPS domain. While several differences were revealed that may require adapted methods, many challenges and practices are shared with typical enterprise applications. 
Our study supports CPS researchers and practitioners with a summary of challenges, practices to address them, and research opportunities.} +title = {Adopting microservices and DevOps in the cyber-physical systems domain: A rapid review and case study} +url = {https://onlinelibrary.wiley.com/doi/10.1002/spe.3169} +year = {2022} +file = {Fritzsch2022.pdf} +doi = {10.1002/spe.3169} +} + +@inproceedings{Bogner2019ServicePatterns, +author = {Justus Bogner and Stefan Wagner and Alfred Zimmermann} +abstract = { +Background +Design patterns are supposed to improve various quality attributes of software systems. However, there is controversial quantitative evidence of this impact. Especially for younger paradigms such as service- and Microservice-based systems, there is a lack of empirical studies. + +Objective +In this study, we focused on the effect of four service-based patterns—namely Process Abstraction, Service Façade, Decomposed Capability, and Event-Driven Messaging—on the evolvability of a system from the viewpoint of inexperienced developers. + +Method +We conducted a controlled experiment with Bachelor students (N = 69). Two functionally equivalent versions of a service-based web shop—one with patterns (treatment group), one without (control group)—had to be changed and extended in three tasks. We measured evolvability by the effectiveness and efficiency of the participants in these tasks. Additionally, we compared both system versions with nine structural maintainability metrics for size, granularity, complexity, cohesion, and coupling. + +Results +Both experiment groups were able to complete a similar number of tasks within the allowed 90 min. Median effectiveness was 1/3. Mean efficiency was 12% higher in the treatment group, but this difference was not statistically significant. Only for the third task, we found statistical support for accepting the alternative hypothesis that the pattern version led to higher efficiency. 
In the metric analysis, the pattern version had worse measurements for size and granularity while simultaneously having slightly better values for coupling metrics. Complexity and cohesion were not impacted. + +Interpretation +For the experiment, our analysis suggests that the difference in efficiency is stronger with more experienced participants and increased from task to task. With respect to the metrics, the patterns introduce additional volume in the system, but also seem to decrease coupling in some areas. + +Conclusions +Overall, there was no clear evidence for a decisive positive effect of using service-based patterns, neither for the student experiment nor for the metric analysis. This effect might only be visible in an experiment setting with higher initial effort to understand the system or with more experienced developers.} +title = {On the impact of service-oriented patterns on software evolvability: a controlled experiment and metric-based analysis} +url = {https://peerj.com/articles/cs-213/#} +year = {2019} +file = {Bogner2019.pdf} +doi = {10.7717/peerj-cs.213} +} + +@inproceedings{Keim2024TraceLinks, +author = {Jan Keim and Sophie Corallo and Dominik Fuchß and Tobias Hey and Tobias Telge and Anne Koziolek} +abstract = {Introduction Software development involves creating various artifacts at different levels of abstraction and establishing relationships between them is essential. Traceability link recovery (TLR) automates this process, enhancing software quality by aiding tasks like maintenance and evolution. However, automating TLR is challenging due to semantic gaps resulting from different levels of abstraction. While automated TLR approaches exist for requirements and code, architecture documentation lacks tailored solutions, hindering the preservation of architecture knowledge and design decisions.
Methods This paper presents our approach TransArC for TLR between architecture documentation and code, using component-based architecture models as intermediate artifacts to bridge the semantic gap. We create transitive trace links by combining the existing approach ArDoCo for linking architecture documentation to models with our novel approach ArCoTL for linking architecture models to code. + Results We evaluate our approaches with five open-source projects, comparing our results to baseline approaches. The model-to-code TLR approach achieves an average F1-score of 0.98, while the documentation-to-code TLR approach achieves a promising average F1-score of 0.82, significantly outperforming baselines. Conclusion Combining two specialized approaches with an intermediate artifact shows promise for bridging the semantic gap. In future research, we will explore further possibilities for such transitive approaches.} +title = {Recovering Trace Links Between Software Documentation And Code} +url = {https://dl.acm.org/doi/10.1145/3597503.3639130} +year = {2024} +file = {Keim2024.pdf} +doi = {10.1145/3597503.3639130} +} + diff --git a/src/test/resources/pdfs/PdfContentImporter/readme.md b/src/test/resources/pdfs/PdfContentImporter/readme.md new file mode 100644 index 00000000000..55d64f5726e --- /dev/null +++ b/src/test/resources/pdfs/PdfContentImporter/readme.md @@ -0,0 +1 @@ +- Open `src/test/resources/pdfs/PdfContentImporter/pdfContentImporterTest-pdfs.bib` to see information on the PDFs diff --git a/src/test/resources/pdfs/example-scientificThesisTemplate.pdf b/src/test/resources/pdfs/example-scientificThesisTemplate.pdf new file mode 100644 index 00000000000..7516ef5656e Binary files /dev/null and b/src/test/resources/pdfs/example-scientificThesisTemplate.pdf differ