-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtopics.py
370 lines (342 loc) · 17.2 KB
/
topics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
import config
import pandas as pd
import numpy as np
import files
import os
import markdown
import yaml
import log
import software
import solutions
import tasks
import static_files
from datetime import datetime
import re
import shell
import itertools
import glob
# Module-level cache for the table built by all(); None means "not loaded yet".
df = None

# Load from disk a table of all the topics in the database.
# Cache the result so that later runs are faster.
#
# Each document in config.topics_folder (other than README.md) becomes one row, with
# the columns 'topic name', 'topic filename', 'permalink', 'content', 'raw content',
# one column per key in the document's metadata (which may override the columns just
# listed), and finally 'markdown link'.
#
# If the folder contains no topics, the result is an empty table that still has the
# columns named above (other than the metadata ones).
def all ():
    global df
    if df is not None:
        return df
    base_columns = [ 'topic name', 'topic filename', 'permalink', 'content', 'raw content' ]
    records = [ ]
    for topic_file in files.just_docs( os.listdir( config.topics_folder ) ):
        if topic_file == 'README.md':
            continue
        full_filename = os.path.join( config.topics_folder, topic_file )
        markdown_content = markdown.read_doc( full_filename )
        metadata, content = yaml.split_string( markdown_content )
        content += files.modification_text( full_filename )
        topic_name = files.without_extension( topic_file )
        record = {
            'topic name' : topic_name,
            'topic filename' : config.relativize_path( full_filename ),
            'permalink' : config.blogify( topic_name ),
            'content' : content,
            'raw content' : markdown_content
        }
        # Metadata keys are copied in last, so they win over the defaults above.
        record.update( metadata.items() )
        records.append( record )
    # A DataFrame built from an empty list has no columns at all, which would make the
    # column lookup below fail, so in that case we create the base columns explicitly.
    if records:
        table = pd.DataFrame( records )
    else:
        table = pd.DataFrame( columns=base_columns )
    # Add links to topic pages
    table['markdown link'] = table['topic name'].apply(
        lambda name: f'[{name}](../{config.blogify(name)})' )
    # Only publish the table to the cache once it is complete, so that a failure
    # above cannot leave a half-built table cached.
    df = table
    return df
# Forget the cached topics table, so that the next call to all() rebuilds it from
# what is on disk.  Call this after adding or editing a topic file.
def clear_cache ():
    global df
    df = None
# Human-readable name for a software package together with the name of a solution
# (or set of libraries) that uses it.  The special value 'solution' produces
# "pure <package>"; anything else is appended to the package name.
def _pair_to_title ( package, libraries='solution' ):
    return f'pure {package}' if libraries == 'solution' else f'{package} {libraries}'
# Objects of the following class represent an individual row in the topics df.
class Topic:

    # The row may be an integer index, a topic name, or a row from the df:
    #  - a pd.Series is taken to be a row of the topics df, and its .name is the index;
    #  - a str is looked up in the 'topic name' column (KeyError if there is no such topic);
    #  - an integer (Python or NumPy) is used as the index directly.
    # Anything else raises a TypeError.
    def __init__ ( self, row ):
        if not isinstance( row, ( pd.Series, str, int, np.integer ) ):
            raise TypeError( f"Topics cannot be constructed from a {type(row)}" )
        topics = all()
        if isinstance( row, pd.Series ):
            self.index = row.name
        elif isinstance( row, str ):
            self.index = dict( zip( topics['topic name'], topics.index ) )[row]
        else:
            self.index = row
        self._row = topics.iloc[self.index,:]
        # Cache for tasks(), filled in the first time it is called
        self._tasks = None

    # Getters for all columns in the topics df
    @property
    def topic_name ( self ):
        return self._row['topic name']

    @property
    def topic_filename ( self ):
        return self._row['topic filename']

    @property
    def permalink ( self ):
        return self._row['permalink']

    @property
    def content ( self ):
        return self._row['content']

    @property
    def raw_content ( self ):
        return self._row['raw content']

    # The 'author' column only exists if some topic declares an author in its
    # metadata, so fall back to NaN (what pandas uses for a missing value) if it is absent.
    @property
    def author ( self ):
        return self._row.get( 'author', np.nan )

    @property
    def markdown_link ( self ):
        return self._row['markdown link']

    # And a getter for the whole row
    @property
    def row ( self ):
        return self._row

    # And for its online link in the live website
    @property
    def url ( self ):
        return config.site_link( self.permalink )

    # Same as self.content, but with all task names converted to Markdown links
    def content_with_links ( self ):
        return tasks.make_links( self.content )

    # Table of all tasks that appear in this topic, in the order that they appear.
    # A task "appears" if its permalink occurs anywhere in content_with_links().
    # The result is a copy of the matching rows of tasks.all(), with an extra column
    # 'where appears' (position of the first occurrence), sorted by that column.
    # This may be slow, so the result is cached.
    def tasks ( self ):
        if self._tasks is None:
            content = self.content_with_links()
            all_tasks = tasks.all()
            appears_here = all_tasks.permalink.apply( lambda link: link in content )
            result = all_tasks[appears_here].copy()
            result['where appears'] = result.permalink.apply( lambda link: content.index( link ) )
            self._tasks = result.sort_values( by='where appears' )
        return self._tasks

    # Get a copy of the tasks() table, with an extra boolean column at the end, called "included".
    # Values in that column are true iff Task.first_solution_using() finds a solution for the
    # corresponding row in the given software and libraries.
    # TO BE CLEAR: This function returns THE SAME number of rows as tasks(), but just marks them as included or
    # not.  This gives you the opportunity to filter or loop over the collection with greater flexibility.
    # NOTE(review): the default for libraries here is 'software', while every other method in this
    # class defaults to 'solution'.  That looks like a typo, but it has been left alone because
    # changing a default could affect callers; please confirm.
    def tasks_for ( self, software, libraries='software' ):
        result = self.tasks().copy()
        result['included'] = [
            tasks.Task( row ).first_solution_using( software, libraries ) is not None
            for _, row in result.iterrows()
        ]
        return result

    # Title of a PDF for this topic, given the software the PDF will focus on.
    # This doubles as the filename for the PDF as well (without the .pdf extension).
    def pdf_title ( self, package ):
        return f'{self.topic_name} in {package}'

    # Full path to the output PDF for this topic, given the software the PDF will focus on
    # The final parameter can be used to specify a different "main folder" than the default from config.
    def pdf_outfile ( self, package, folder=config.jekyll_input_folder ):
        return os.path.join( folder, 'assets', 'downloads',
                             self.pdf_title( package ) + '.pdf' )

    # List of all PDF files that have been generated for this topic.
    # Each will be the full path to the file, as generated by pdf_outfile().
    # Only files currently on disk will be returned.
    def existing_pdf_files ( self, folder=config.jekyll_input_folder ):
        # Using '*' as the package name turns the output path into a glob pattern.
        return glob.glob( self.pdf_outfile( '*', folder ) )

    # Converts a PDF filename (with or without a path) into an URL in the published site
    # for downloading that file.
    # This uses no instance state, so it is a static method; it can be called either as
    # Topic.pdf_url( filename ) or on an instance.
    @staticmethod
    def pdf_url ( filename ):
        # URLs always use forward slashes, whatever the local path separator is.
        return config.site_link( '/'.join(
            ( 'assets', 'downloads', os.path.basename( filename ) ) ) )

    # Whether we must rebuild the PDF for this topic-software-library triple,
    # based on the timestamps of all relevant files on disk.  We must rebuild if the
    # PDF does not exist, or if it is older than the topic file, or older than the task
    # file or output file of any solution that would be included in it.
    # The out_folder parameter says where the PDF lives; it is passed to pdf_outfile(),
    # and if omitted, the default of pdf_outfile() is used.
    # NOTE(review): the folder parameter is accepted but never used; it is retained
    # only so that existing callers keep working.
    def must_build_pdf ( self, package, libraries='solution', folder=config.main_folder,
                         out_folder=None ):
        if out_folder is None:
            outfile = self.pdf_outfile( package )
        else:
            outfile = self.pdf_outfile( package, out_folder )
        if not os.path.exists( outfile ):
            log.info( f'Rebuilding because DNE {outfile}' )
            return True
        pdf_last_modified = os.path.getmtime( outfile )
        if os.path.getmtime( self.topic_filename ) > pdf_last_modified:
            log.info( f'Rebuilding because newer {self.topic_filename}' )
            return True
        for _, row in self.tasks().iterrows():
            solution = tasks.Task( row ).first_solution_using( package, libraries )
            if solution is None:
                continue # tasks without a solution contribute no files to compare
            task = solution.task()
            if os.path.getmtime( task.task_filename ) > pdf_last_modified:
                log.info( f'Rebuilding because newer {task.task_filename}' )
                return True
            solution_output = solution.output_file()
            if os.path.getmtime( solution_output ) > pdf_last_modified:
                log.info( f'Rebuilding because newer {solution_output}' )
                return True
        return False

    # The TOC and description for a PDF built from this topic, in the given software,
    # as a Markdown string
    def pdf_header ( self, package ):
        return static_files.fill_template( 'topic-pdf',
            TITLE = self.pdf_title( package ),
            SITE_URL = config.site_url,
            DATE = datetime.now().strftime("%d %B %Y"),
            DESCRIPTION = self.content_with_links() )

    # The portion of the PDF body that goes with the given task.
    # If the task has no solution in the given software and libraries, the section
    # contains a sentence saying so instead of a solution.
    def pdf_one_solution ( self, task, package, libraries='solution', temp_folder=config.topics_folder ):
        solution = task.first_solution_using( package, libraries )
        if solution is None:
            solution_text = f'How to Data does not yet contain a solution for this task in {package}.'
            solution_name = package
        else:
            solution_text = markdown.html_sections_to_latex( markdown.unescape_for_jekyll(
                solution.generated_body() ), temp_folder )
            solution_name = _pair_to_title( solution.software, solution.solution_name )
        return static_files.fill_template( 'topic-pdf-solution',
            TASK = task.task_name,
            DESCRIPTION = tasks.make_links( task.content ),
            SOFTWARE = solution_name,
            SOLUTION = solution_text )

    # All the markdown content for generating the PDF for this topic in the given software and libraries
    def build_pdf_text ( self, package, libraries='solution', temp_folder=config.topics_folder ):
        sections = [ self.pdf_header( package ) ]
        topic_tasks = self.tasks()
        for _, task_row in topic_tasks.iterrows():
            sections.append( self.pdf_one_solution(
                tasks.Task( task_row ), package, libraries, temp_folder ) )
        result = ''.join( sections )
        # Permalinks of the tasks that have a section in this PDF, computed once
        # rather than once per hyperlink.
        permalinks_in_pdf = set( topic_tasks.permalink )
        # fix all hyperlinks to be either within-the-PDF links or marked as external
        def process_one_link ( match ):
            text = match.group( 1 )
            href = match.group( 2 )
            if href.startswith( '../' ):
                target = href[3:]
                if target in permalinks_in_pdf:
                    href = f'#{target}'
                else:
                    href = f'{config.site_link(target)}'
                    text = f'{text} (on website)'
            elif href.startswith( '.' ): # startswith() is safe even if the link target is empty
                log.warning( 'Bad external URL:', f'[{text}]({href})' )
            return f'[{text}]({href})'
        # Match [text](href) but not ![text](href), so that images are left alone.
        return re.sub( r'(?<!\!)\[([^]]*)\]\(([^)]*)\)', process_one_link,
                       result, flags=re.IGNORECASE )

    # Build the markdown content this topic needs for the PDF for the given software and libraries,
    # save that markdown to a temp file, and compile it into a PDF.  Then delete the temp file.
    # The first three parameters are passed directly to build_pdf_text().
    # Returns the name of the PDF file, without any path.
    def build_pdf_file (
        self, package, libraries='solution',
        temp_folder=config.topics_folder, out_folder=config.jekyll_input_folder,
        main_folder=config.main_folder
    ):
        outfile = self.pdf_outfile( package, out_folder )
        tmp_md_doc = os.path.join( temp_folder, 'pandoc-temp-file.md' )
        markdown.write( tmp_md_doc,
            self.build_pdf_text( package, libraries, temp_folder ), add_escapes=False )
        command_to_run = 'pandoc --from=markdown --to=pdf --pdf-engine=xelatex' \
                       + ' -V geometry:margin=1in -V urlcolor:NavyBlue --standalone' \
                       + f' --include-in-header="{os.path.join(main_folder,"pandoc-latex-header.tex")}"' \
                       + f' --lua-filter="{os.path.join(main_folder,"pandoc-pdf-tweaks.lua")}"' \
                       + f' --output="{outfile}" "{tmp_md_doc}"'
        shell.run_or_halt( command_to_run, f'rm "{tmp_md_doc}"' )
        log.built( "PDF", outfile )
        return self.pdf_title( package ) + '.pdf'

    # Build all PDFs that make sense for this topic and return the list of their titles
    # (that is, their filenames without the .pdf extension), including those that were
    # already up to date and so did not need rebuilding.
    # This function need not be called by clients; it is called as part of the build_file() routine.
    # Note that this will create one PDF per software package, and it will choose a set of libraries
    # for that software package that (1) maximizes the number of tasks from this topic that will
    # be solved and within that (2) minimizes the number of libraries included.
    # No PDF is built for a software package that solves less than min_proportion of the
    # tasks in this topic, nor for one that solves none of them.
    def build_all_pdf_files (
        self, min_proportion=0.5, temp_folder=config.topics_folder, out_folder=config.jekyll_input_folder,
        main_folder=config.main_folder
    ):
        result = [ ]
        # every library subset is judged against the same tasks, so count them just once
        num_tasks = len( self.tasks() )
        # loop through all software packages
        for sw_name in software.all()['name']:
            sw = software.Software( sw_name )
            best_lib_set = None
            best_num_sols = 0
            # consider every possible subset of the libraries for it in our database
            # counting from the smallest sets upwards in size
            libraries = sw.all_libraries()
            for size in range( len( libraries ) + 1 ):
                for lib_subset in itertools.combinations( libraries, size ):
                    # record which of them has the best coverage (most # solutions it can handle)
                    # and because we are counting upwards in size, we will retain the smallest
                    # subset (or one tied for smallest) that achieves that maximum
                    tasks_for_subset = self.tasks_for( sw_name, software.libs_set_to_str( lib_subset ) )
                    num_sols = sum( tasks_for_subset['included'] )
                    if num_sols > best_num_sols:
                        best_num_sols = num_sols
                        best_lib_set = lib_subset
            # if the coverage of the best one we found is not sufficient, stop here.
            # A topic with no tasks has zero coverage (rather than a division by zero), and
            # if no subset solved anything, then there is no library set to build with.
            proportion = best_num_sols / num_tasks if num_tasks > 0 else 0.0
            if best_lib_set is None or proportion < min_proportion:
                log.not_built( self.pdf_outfile( sw_name, out_folder ),
                               Reason="Not enough solutions available",
                               Percentage=f"{proportion*100:0.1f}%" )
                continue
            # it is sufficient, so we can build a PDF for this software package (and this topic).
            # but should we?  depends on file timestamps, so we check that now, being sure
            # to look at the PDF in the same folder we would build it into.
            best_libraries = software.libs_set_to_str( best_lib_set )
            if self.must_build_pdf( sw_name, best_libraries, out_folder=out_folder ):
                # okay we can actually build this PDF!  Do so...
                self.build_pdf_file( sw_name, best_libraries, temp_folder, out_folder, main_folder )
            else:
                log.not_built( self.pdf_outfile( sw_name, out_folder ),
                               Reason="Already up to date" )
            result.append( self.pdf_title( sw_name ) )
        return result

    # Must this topic be rebuilt?  Right now, we always rebuild these because it's easy.
    # The second parameter would be passed directly to output_file() if we later upgraded
    # this function to do any file-based comparisons.
    def must_build ( self, folder=None ):
        return True

    # Build the markdown content this topic generates but just return it as a string.
    # This necessitates first building any PDFs that will be linked to from this markdown,
    # and including links to them herein.
    def build_text ( self ):
        pdf_titles = self.build_all_pdf_files()
        if pdf_titles:
            pdf_downloads = ''
            for pdf_title in pdf_titles:
                link_text = pdf_title.replace( self.topic_name, 'Solutions' ) + ' (download PDF)'
                pdf_filename = '../assets/downloads/' + pdf_title + '.pdf'
                pdf_downloads += f' * [{link_text}]({pdf_filename})\n'
        else:
            pdf_downloads = 'No PDF downloads available for this topic yet.'
        # A missing author shows up as NaN (or None).  NaN is not equal to anything, not
        # even itself, so comparing with != np.nan would always say an author is present.
        author = self.author
        has_author = author is not None \
                 and not ( isinstance( author, float ) and np.isnan( author ) )
        return static_files.fill_template( 'topic',
            TITLE = self.topic_name,
            PERMALINK = self.permalink,
            CONTENT = self.content_with_links(),
            CONTRIBUTORS = f'Contributed by {author}' if has_author else '',
            DOWNLOADS = pdf_downloads )

    # What Markdown file does this topic generate?
    # By default, the result will be in the jekyll input folder defined in config.py.
    # But you can choose a different output folder with the second argument.
    def output_file ( self, folder=None ):
        if folder is None:
            folder = config.jekyll_input_folder
        return os.path.join( folder, self.permalink + '.md' )

    # Build the markdown content this topic generates and save it to disk.
    # This necessitates first building any PDFs that will be linked to from this markdown,
    # and including links to them herein.
    # Returns the path of the file written.
    def build_file ( self, folder=None ):
        markdown_content = self.build_text()
        output_file = self.output_file( folder )
        markdown.write( output_file, markdown_content )
        files.mark_as_regenerated( self.permalink + '.md' )
        return output_file

    # Run build_file() if needed (or if forced).
    # Otherwise, log that it wasn't needed, but mark the file as up-to-date.
    def build ( self, folder=None, force=False ):
        if force or self.must_build( folder ):
            self.build_file( folder )
        else:
            log.not_built( self.output_file( folder ), self.topic_name )
            files.mark_as_regenerated( self.permalink + '.md' )

    # Read from disk the most recent generated markdown content for this topic.
    # Parameter customizes what folder is passed to output_file().
    def generated_markdown ( self, folder=None ):
        return files.read_text_file( self.output_file( folder ) )