-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtests.py
131 lines (110 loc) · 6 KB
/
tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from os import getcwd, makedirs
from os.path import join
from extract import single_pdf_extract_process, CWD
from analyze import single_regex_label
import sys
import argparse
# Passed as `use_jar` to the figure extractor in `test_extraction`. Defaults to False and
# is overridden by the `-jar` command-line flag when this file is run as a script.
JAR = False

# The LLM helpers are optional: `test_gpt` checks this flag before calling them.
llm_available = False
try:
    from llm_operations.run_llm_with_abstract import assistant
    from llm_operations.gpt_utils import extract_json_from_response
except Exception:
    # Deliberately broad: presumably the import can fail for reasons other than a missing
    # module (e.g. no API key configured) - TODO confirm. `Exception` rather than a bare
    # `except:` so that KeyboardInterrupt/SystemExit still propagate.
    pass
else:
    llm_available = True
from labelling_app.app import load_json, save_json
import unittest
# Failure message for `test_extraction`: shown when the extractor returns no captions,
# and lists the likely causes to check.
extract_fail_message = """
No captions/figures extracted from the PDF!
Potential causes:
- missing input pdf in test_data
- FigureExtractor not built properly - make sure user permissions on pdffigures2/ set right and `sbt build`
has been successfully run in the `pdffigures2` directory
- some bug in the folder structure/os mkdirs
- sbt can fail to install dependencies like ivy on managed system (like university computers)
"""
# Failure message for `test_gpt`: shown when the LLM produced no caption labels.
llm_fail_message = """
No captions analyzed by GPT!
Potential causes:
- no OPENAI_API_KEY env variable set
- no internet connection
- timeout on the API request
"""
# Paper abstract (the "kintsugi" electrode-imaging study) passed to the LLM assistant
# alongside each figure caption in `test_gpt`.
abstract = """The mesostructure of porous electrodes used in lithium-ion batteries strongly influences cell performance. Accurate imaging of the
distribution of phases in these electrodes would allow this relationship to be better understood through simulation. However,
imaging the nanoscale features in these components is challenging. While scanning electron microscopy is able to achieve the
required resolution, it has well established difficulties imaging porous media. This is because the flat imaging planes prepared using
focused ion beam milling will intersect with the pores, which makes the images hard to interpret as the inside walls of the pores are
observed. It is common to infiltrate porous media with resin prior to imaging to help resolve this issue, but both the nanoscale
porosity and the chemical similarity of the resins to the battery materials undermine the utility of this approach for most electrodes.
In this study, a technique is demonstrated which uses in situ infiltration of platinum to fill the pores and thus enhance their contrast
during imaging. Reminiscent of the Japanese art of repairing cracked ceramics with precious metals, this technique is referred to as
the kintsugi method. The images resulting from applying this technique to a conventional porous cathode are presented and then
segmented using a multi-channel convolutional method. We show that while some cracks in active material particles were empty,
others appear to be filled (perhaps with the carbon binder phase), which will have implications for the rate performance of the cell.
Energy dispersive X-ray spectroscopy was used to validate the distribution of phases resulting from image analysis, which also
suggested a graded distribution of the binder relative to the carbon additive. The equipment required to use the kintsugi method is
commonly available in major research facilities and so we hope that this method will be rapidly adopted to improve the imaging of
electrode materials and porous media in general."""
class Tests(unittest.TestCase):
    """Smoke tests for figure extraction and caption labelling.

    All fixtures live under `<CWD>/test_data/`. Checks use the `unittest` assertion
    methods rather than bare `assert` statements so that they still run when Python is
    started with `-O` (which strips `assert`).
    """

    # TODO: add scraping test?

    def test_extraction(self):
        """Run figure/caption extractor on the test pdf `test_data/tmp.pdf`, splitting extracted
        figures into subfigures as well. Outputs original and sub-figures to
        `test_data/processed/` folder. Fails if no figures extracted."""
        target_dir = join(CWD, "test_data/")
        target_pdf = join(target_dir, "tmp.pdf")
        out_img_dir = join(CWD, "test_data/out/imgs/")
        out_data_dir = join(CWD, "test_data/out/")
        processed_dir = join(CWD, "test_data/processed/")
        # out_data_dir is the parent of out_img_dir, so creating the latter creates both.
        for new_dir in (target_dir, out_img_dir, processed_dir):
            makedirs(new_dir, exist_ok=True)

        # Echo which extractor backend flag is in effect for this run.
        print(JAR)
        captions, _ = single_pdf_extract_process(
            target_pdf, out_img_dir, out_data_dir, processed_dir, use_jar=JAR
        )
        self.assertGreater(len(captions), 0, extract_fail_message)

    def test_regex(self):
        """Run basic string matching caption/instrument analysis on the captions of the pdf.

        Fails if no regex labels are produced or if any produced label is not flagged
        as a micrograph.
        """
        labels_path = join(CWD, "test_data/analyze/labels.json")
        captions_path = join(CWD, "test_data/analyze/captions.json")
        out = single_regex_label(labels_path, captions_path)
        regex_labels = out["regex"]
        self.assertGreater(
            len(regex_labels),
            0,
            "Failed to analyze captions with regex - files may be in the wrong place.",
        )
        for figure_eval in regex_labels:
            # assertEqual keeps the original `== True` comparison semantics.
            self.assertEqual(figure_eval["isMicrograph"], True, "Incorrect regex eval")

    def test_gpt(self):
        """Run LLM-based caption analysis on the captions. Requires the OPENAI_API_KEY
        environment variable to be set.

        Each caption entry whose `figType` is "Figure" is sent to the assistant together
        with the module-level `abstract`; the parsed responses are stored under the
        "llm" key of the labels file. Fails if the LLM helpers could not be imported or
        if no caption was labelled.
        """
        self.assertTrue(
            llm_available,
            "Need to set OPENAI_API_KEY environment variable before LLM labelling",
        )
        labels_path = join(CWD, "test_data/analyze/labels.json")
        captions_path = join(CWD, "test_data/analyze/captions.json")
        labels_data = load_json(labels_path)
        captions_data = load_json(captions_path)

        llm_labels = []
        for item in captions_data:
            if item["figType"] != "Figure":
                continue
            response = assistant(abstract, item["caption"])
            response_data = extract_json_from_response(response)
            response_data["figure"] = item["name"]
            llm_labels.append(response_data)

        labels_data["llm"] = llm_labels
        # Write back to the same CWD-anchored file that was loaded above, not to a path
        # relative to whatever directory the process happens to be running in.
        save_json(labels_path, labels_data)
        self.assertGreater(len(llm_labels), 0, llm_fail_message)
if __name__ == "__main__":
    # Combining argparse with unittest.main follows chepner's answer at
    # https://stackoverflow.com/questions/73625874/argparse-and-unittest-main
    cli = argparse.ArgumentParser()
    cli.add_argument("-jar", action="store_true", help="Use .jar of pdffigures not sbt")
    # Consume only our own flag; everything else is handed on to unittest untouched.
    known, passthrough = cli.parse_known_args()
    # Rebind the module-level flag that test_extraction forwards as `use_jar`.
    JAR = known.jar
    # unittest.main expects the program name in argv[0], followed by its own options.
    unittest.main(argv=[sys.argv[0], *passthrough])