diff --git a/build/config.yaml b/build/config.yaml
index c2abb38..8e321b3 100644
--- a/build/config.yaml
+++ b/build/config.yaml
@@ -38,10 +38,4 @@ generator:
   slices:
     - 20
-    - 22
-    - 31
-    - 25
-    - 30
-    - 28
-    - 27
     - remainder
diff --git a/build/output.html.mako b/build/output.html.mako
index 29334b4..0f3b277 100644
--- a/build/output.html.mako
+++ b/build/output.html.mako
@@ -45,7 +45,7 @@
 % for row in slice:
-            ${row[col['primary_attr']]}
+            ${row[col['primary_attr']] | h}
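The template change pipes the cell value through Mako's built-in `h` expression filter, which HTML-escapes it, so markup inside record fields renders as text instead of being interpreted. A standalone sketch of the effect (illustrative value, not project data):

    from mako.template import Template

    # '| h' applies Mako's built-in HTML escaping to the substituted value.
    print(Template("${value | h}").render_unicode(value="<b>433.500</b>"))
    # -> &lt;b&gt;433.500&lt;/b&gt;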
diff --git a/build/style.css b/build/style.css
index d163478..e0d0fde 100644
--- a/build/style.css
+++ b/build/style.css
@@ -7,7 +7,8 @@ body {
 .page-container {
   width: 100vw;
   height: 100vh;
-  contain: paint;
+  position: relative;
+  /* contain: paint; */
 }

 .flex-container {
diff --git a/freq_table/__main__.py b/freq_table/__main__.py
index 6c0ef90..f8d76ba 100644
--- a/freq_table/__main__.py
+++ b/freq_table/__main__.py
@@ -1,13 +1,8 @@
 import logging
 import os
-from argparse import ArgumentParser, RawDescriptionHelpFormatter

-import yaml
+from .cli import Cli

-from . import utils
-
-# TODO: write README
-# TODO: switch to OO style
 # TODO: auto make pdf
 # TODO: enable multiple templates
 # TODO: add record merging
@@ -16,143 +11,11 @@
 # TODO: improve page layout (a tricky task)
 # TODO: inline css?

-DEFAULT_CONFIG = 'build/config.yaml'
-DEFAULT_TEMPLATE = 'build/output.html.mako'
-DEFAULT_OUTPUT = 'build/output.html'
-DEFAULT_RECORDS = 'build/records.yaml'
-
-DESCRIPTION = """
-Make printable tables from http://radioscanner.ru frequency db.
-
-A *record* represents a frequency with associated information.
-Records are identified by URL. When gathering records from multiple
-sources, records with the same URl are overwritten by those processed later.
-
-Without arguments, the program tries to load records from the default file
-falling back to scraping if that's not present.
-"""
-
-logger = logging.getLogger(__name__)
-if os.getenv('LOGLEVEL'):
-    level = getattr(logging, os.getenv('LOGLEVEL').upper())
-    logging.basicConfig(level=level)
-
-
-def parse_args():
-    parser = ArgumentParser(prog='freq_table', description=DESCRIPTION,
-                            formatter_class=RawDescriptionHelpFormatter,
-                            epilog='Logs are controlled by the env var LOGLEVEL')
-
-    parser.add_argument('-s', '--scrape', action='store_true',
-                        help='download records from the web (before loading files)')
-
-    parser.add_argument('-u', '--update', action='store_true',
-                        help='scrape *after* loading files')
-
-    parser.add_argument('-n', '--no-gen', dest='gen_html', action='store_false',
-                        help='do not generate html output')
-
-    parser.add_argument('-l', '--load', nargs='?', const=DEFAULT_RECORDS,
-                        action='append', metavar='FROM_FILE',
-                        help='load records from a file; repeat for multiple files'
-                             ' (just "-l" defaults to %(const)s)')
-
-    parser.add_argument('-d', '--dump', nargs='?', const=DEFAULT_RECORDS, metavar='TO_FILE',
-                        help='save all gathered records to a file'
-                             ' (just "-d" defaults to %(const)s)')
-
-    parser.add_argument('-c', '--config', default=DEFAULT_CONFIG, metavar='CONF_FILE',
-                        help='config file (default: %(default)s)')
-
-    parser.add_argument('-o', '--output', default=DEFAULT_OUTPUT, metavar='OUT_FILE',
-                        help='output file (default: %(default)s)')
-
-    parser.add_argument('-t', '--template', default=DEFAULT_TEMPLATE, metavar='TMPL_FILE',
-                        help='template file (default: %(default)s)')
-
-    args = parser.parse_args()
-
-    if args.update:
-        args.scrape = True
-
-    if not args.load and not args.scrape:
-        if os.path.isfile(DEFAULT_RECORDS):
-            args.load = (DEFAULT_RECORDS,)
-            logger.warning('Using default records file.')
-        else:
-            args.scrape = True
-            logger.warning('No records file found, will scrape the web.')
-
-    return parser, args
-
-
-def add_records(store, records, from_='the web (may take a while)'):
-    logger.warning('Adding records from %s...', from_)
-    cnt = 0
-    for r in records:
-        store[r['url']] = r
-        cnt += 1
-    logger.warning('Added %i records.', cnt)
-
-
-def main():
-    parser, args = parse_args()
-
-    # load config
-    with open(args.config) as fh:
-        logger.warning('Loading config from %s...', fh.name)
-        utils.config = yaml.safe_load(fh)
-
-    # these imports use loaded config
-    from . import scraper
-    from . import generator
-    records_by_id = {}
-
-    # scrape records from the web
-    if args.scrape and not args.update:
-        add_records(records_by_id, scraper.get_records())
-
-    # load records from files
-    if args.load:
-        for file in args.load:
-            with open(file) as fh:
-                records = yaml.safe_load(fh)
-                add_records(records_by_id, records, from_=fh.name)
-
-    # scrape overwriting loaded records
-    if args.update:
-        add_records(records_by_id, scraper.get_records())
-
-    if not args.dump and not args.gen_html:
-        return
-
-    if len(records_by_id) == 0:
-        parser.error('No records to process. Aborting.')
-
-    logger.warning('Sorting records...')
-    records = sorted(records_by_id.values(),
-                     key=lambda r: float(r['frequency']))
-    logger.warning('Sorted %i records.', len(records))
-
-    # dump records to file
-    if args.dump:
-        with open(args.dump, 'w') as fh:
-            logger.warning('Writing records to %s...', fh.name)
-            yaml.safe_dump(records, fh, allow_unicode=True)
-
-    # and finally...
-    if args.gen_html:
-        with open(args.template) as fh:
-            logger.warning('Reading template from %s...', fh.name)
-            tmpl = fh.read()
-
-        with open(args.output, 'w') as fh:
-            logger.warning('Generating html at %s...', fh.name)
-            html = generator.generate_html(tmpl, records)
-            fh.write(html)
-
-    logger.warning('Done.')
-
-
 if __name__ == '__main__':
-    main()
+    if os.getenv('LOGLEVEL'):
+        level = getattr(logging, os.getenv('LOGLEVEL').upper())
+        logging.basicConfig(level=level)
+
+    cli = Cli()
+    cli.parse_args()
+    cli.main()
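The LOGLEVEL handling kept in the entry point relies on the stdlib exposing its level constants as module attributes, so the environment string resolves directly to a numeric level:

    import logging

    # 'info' from the environment resolves to the numeric constant
    # (logging.INFO == 20); an unknown name raises AttributeError.
    level = getattr(logging, 'info'.upper())
    logging.basicConfig(level=level)
    logging.getLogger(__name__).info('logging configured at %s', level)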
diff --git a/freq_table/cli.py b/freq_table/cli.py
new file mode 100644
index 0000000..e637ea7
--- /dev/null
+++ b/freq_table/cli.py
@@ -0,0 +1,155 @@
+import logging
+import os
+from argparse import ArgumentParser, RawDescriptionHelpFormatter
+
+import yaml
+
+from .generator import Generator
+from .records import RecordStore
+from .scraper import Scraper
+
+DEFAULT_CONFIG = 'build/config.yaml'
+DEFAULT_TEMPLATE = 'build/output.html.mako'
+DEFAULT_OUTPUT = 'build/output.html'
+DEFAULT_RECORDS = 'build/records.yaml'
+
+DESCRIPTION = """
+Make printable tables from http://radioscanner.ru frequency db.
+
+A *record* represents a frequency with associated information.
+Records are identified by URL. When gathering records from multiple
+sources, records with the same URL are overwritten by those processed later.
+
+Without arguments, the program tries to load records from the default file,
+falling back to scraping if that's not present.
+"""
+
+logger = logging.getLogger(__name__)
+
+
+class Cli:
+    def __init__(self):
+        self.store = RecordStore()
+        parser = self.parser = ArgumentParser(
+            prog='freq_table', description=DESCRIPTION,
+            formatter_class=RawDescriptionHelpFormatter,
+            epilog='Logs are controlled by the env var LOGLEVEL')
+
+        parser.add_argument(
+            '-s', '--scrape', action='store_true',
+            help='download records from the web (before loading files)')
+
+        parser.add_argument(
+            '-u', '--update', action='store_true',
+            help='scrape *after* loading files')
+
+        parser.add_argument(
+            '-n', '--no-gen', dest='gen_html', action='store_false',
+            help='do not generate html output')
+
+        parser.add_argument(
+            '-l', '--load', nargs='?', const=DEFAULT_RECORDS,
+            action='append', metavar='FROM_FILE',
+            help='load records from a file; repeat for multiple files'
+                 ' (just "-l" defaults to %(const)s)')
+
+        parser.add_argument(
+            '-d', '--dump', nargs='?', const=DEFAULT_RECORDS, metavar='TO_FILE',
+            help='save all gathered records to a file'
+                 ' (just "-d" defaults to %(const)s)')
+
+        parser.add_argument(
+            '-c', '--config', default=DEFAULT_CONFIG, metavar='CONF_FILE',
+            help='config file (default: %(default)s)')
+
+        parser.add_argument(
+            '-o', '--output', default=DEFAULT_OUTPUT, metavar='OUT_FILE',
+            help='output file (default: %(default)s)')
+
+        parser.add_argument(
+            '-t', '--template', default=DEFAULT_TEMPLATE, metavar='TMPL_FILE',
+            help='template file (default: %(default)s)')
+
+    def parse_args(self, args=None):
+        args = self.args = self.parser.parse_args(args)
+
+        if args.update:
+            args.scrape = True
+
+        if not args.load and not args.scrape:
+            if os.path.isfile(DEFAULT_RECORDS):
+                args.load = (DEFAULT_RECORDS,)
+                logger.warning('Using default records file.')
+            else:
+                args.scrape = True
+                logger.warning('No records file found, will scrape the web.')
+
+    def main(self):
+        self.load_config()
+
+        if self.args.scrape and not self.args.update:
+            self.scrape_records()
+
+        if self.args.load:
+            self.load_files()
+
+        if self.args.update:
+            self.scrape_records()
+
+        if not self.args.dump and not self.args.gen_html:
+            return
+
+        if self.store.count() == 0:
+            self.parser.error('No records to process. Aborting.')
+
+        logger.warning('Sorting records...')
+        records = self.store.get_sorted_by_freq()
+        logger.warning('Sorted %i records.', len(records))
+
+        if self.args.dump:
+            self.dump_records(records)
+
+        # and finally...
+        if self.args.gen_html:
+            self.gen_html(records)
+
+        logger.warning('Done.')
+
+    def load_config(self):
+        with open(self.args.config) as fh:
+            logger.warning('Loading config from %s...', fh.name)
+            self.config = yaml.safe_load(fh)
+
+    def add_records(self, records, from_):
+        logger.warning('Adding records from %s...', from_)
+        cnt = 0
+        for r in records:
+            self.store.add(r)
+            cnt += 1
+        logger.warning('Added %i records.', cnt)
+
+    def scrape_records(self):
+        scraper = Scraper(self.config['scraper'])
+        self.add_records(scraper.get_records(),
+                         from_='the web (may take a while)')
+
+    def load_files(self):
+        for file in self.args.load:
+            with open(file) as fh:
+                self.add_records(yaml.safe_load(fh), from_=fh.name)
+
+    def dump_records(self, records):
+        with open(self.args.dump, 'w') as fh:
+            logger.warning('Writing records to %s...', fh.name)
+            yaml.safe_dump(records, fh, allow_unicode=True)
+
+    def gen_html(self, records):
+        with open(self.args.template) as fh:
+            logger.warning('Reading template from %s...', fh.name)
+            tmpl = fh.read()
+
+        with open(self.args.output, 'w') as fh:
+            logger.warning('Generating html at %s...', fh.name)
+            generator = Generator(self.config['generator'])
+            html = generator.generate_html(tmpl, records)
+            fh.write(html)
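Splitting parse_args() from main() — and letting parse_args take an explicit argv — makes the class usable from code as well as from the command line. A minimal sketch, with illustrative arguments:

    from freq_table.cli import Cli

    # Equivalent to: python -m freq_table -l -n -d build/records.yaml
    cli = Cli()
    cli.parse_args(['-l', '-n', '-d', 'build/records.yaml'])
    cli.main()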
+""" + +logger = logging.getLogger(__name__) + + +class Cli: + def __init__(self): + self.store = RecordStore() + parser = self.parser = ArgumentParser( + prog='freq_table', description=DESCRIPTION, + formatter_class=RawDescriptionHelpFormatter, + epilog='Logs are controlled by the env var LOGLEVEL') + + parser.add_argument( + '-s', '--scrape', action='store_true', + help='download records from the web (before loading files)') + + parser.add_argument( + '-u', '--update', action='store_true', + help='scrape *after* loading files') + + parser.add_argument( + '-n', '--no-gen', dest='gen_html', action='store_false', + help='do not generate html output') + + parser.add_argument( + '-l', '--load', nargs='?', const=DEFAULT_RECORDS, + action='append', metavar='FROM_FILE', + help='load records from a file; repeat for multiple files' + ' (just "-l" defaults to %(const)s)') + + parser.add_argument( + '-d', '--dump', nargs='?', const=DEFAULT_RECORDS, metavar='TO_FILE', + help='save all gathered records to a file' + ' (just "-d" defaults to %(const)s)') + + parser.add_argument( + '-c', '--config', default=DEFAULT_CONFIG, metavar='CONF_FILE', + help='config file (default: %(default)s)') + + parser.add_argument( + '-o', '--output', default=DEFAULT_OUTPUT, metavar='OUT_FILE', + help='output file (default: %(default)s)') + + parser.add_argument( + '-t', '--template', default=DEFAULT_TEMPLATE, metavar='TMPL_FILE', + help='template file (default: %(default)s)') + + def parse_args(self, args=None): + args = self.args = self.parser.parse_args(args) + + if args.update: + args.scrape = True + + if not args.load and not args.scrape: + if os.path.isfile(DEFAULT_RECORDS): + args.load = (DEFAULT_RECORDS,) + logger.warning('Using default records file.') + else: + args.scrape = True + logger.warning('No records file found, will scrape the web.') + + def main(self): + self.load_config() + + if self.args.scrape and not self.args.update: + self.scrape_records() + + if self.args.load: + self.load_files() + + if self.args.update: + self.scrape_records() + + if not self.args.dump and not self.args.gen_html: + return + + if self.store.count() == 0: + self.parser.error('No records to process. Aborting.') + + logger.warning('Sorting records...') + records = self.store.get_sorted_by_freq() + logger.warning('Sorted %i records.', len(records)) + + if self.args.dump: + self.dump_records(records) + + # and finally... 
diff --git a/freq_table/records.py b/freq_table/records.py
new file mode 100644
index 0000000..cda1d16
--- /dev/null
+++ b/freq_table/records.py
@@ -0,0 +1,12 @@
+class RecordStore:
+    def __init__(self):
+        self.by_id = {}
+
+    def count(self):
+        return len(self.by_id)
+
+    def add(self, record):
+        self.by_id[record['url']] = record
+
+    def get_sorted_by_freq(self):
+        return sorted(self.by_id.values(), key=lambda r: float(r['frequency']))
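Because the store keys records by their url field, re-adding a record with the same URL overwrites the earlier one — exactly the merge semantics the CLI description promises. With illustrative records:

    from freq_table.records import RecordStore

    store = RecordStore()
    store.add({'url': 'http://example.org/1', 'frequency': '145.500'})
    # Same URL: the record processed later wins.
    store.add({'url': 'http://example.org/1', 'frequency': '433.500'})
    print(store.count())                               # 1
    print(store.get_sorted_by_freq()[0]['frequency'])  # '433.500'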
diff --git a/freq_table/scraper.py b/freq_table/scraper.py
index ef4d7b4..25ddf7c 100644
--- a/freq_table/scraper.py
+++ b/freq_table/scraper.py
@@ -19,48 +19,49 @@
     ('description', 'Описание'),
 )

-config = utils.get_config('scraper')
 logger = logging.getLogger(__name__)


-def get_records():
-    for page in itertools.count():
-        logger.info('Fetching page %i', page)
-        r = requests.get(config['url'].format(page=page))
-        yield from get_records_from_page(r.text)
+class Scraper:
+    def __init__(self, config):
+        self.config = config

-        # dirty hack to determine the last page
-        if '">>>' not in r.text:
-            return
+    def get_records(self):
+        for page in itertools.count():
+            logger.info('Fetching page %i', page)
+            r = requests.get(self.config['url'].format(page=page))
+            yield from self.get_records_from_page(r.text)

+            # dirty hack to determine the last page
+            if '">>>' not in r.text:
+                return

-def get_records_from_page(text):
-    soup = BeautifulSoup(text, features="html.parser")
-    for row in soup.find_all('tr', class_=['tbCel1', 'tbCel2']):
-        href = row.find('a')['href']
+    def get_records_from_page(self, text):
+        soup = BeautifulSoup(text, features="html.parser")
+        for row in soup.find_all('tr', class_=['tbCel1', 'tbCel2']):
+            href = row.find('a')['href']

-        logger.info('Fetching record %s', href)
-        r = requests.get(href)
-        rec = parse_record(r.text)
+            logger.info('Fetching record %s', href)
+            r = requests.get(href)
+            rec = self.parse_record(r.text)

-        rec['url'] = href
-        yield rec
+            rec['url'] = href
+            yield rec

+    def parse_record(self, text):
+        soup = BeautifulSoup(text, features="html.parser").find('body')
+        record = {}

-def parse_record(text):
-    soup = BeautifulSoup(text, features="html.parser").find('body')
-    record = {}
+        for name, title in ROW_TITLES:
+            soup = soup.find_next(string=re.compile(title)).find_next('td')
+            value = soup.get_text(separator='\n', strip=True)

-    for name, title in ROW_TITLES:
-        soup = soup.find_next(string=re.compile(title)).find_next('td')
-        value = soup.get_text(separator='\n', strip=True)
+            if name == 'frequency':
+                value = value.partition(' ')[0]  # drop the unit
+            elif name == 'date':
+                value = utils.parse_date(value)

-        if name == 'frequency':
-            value = value.partition(' ')[0]  # drop the unit
-        elif name == 'date':
-            value = utils.parse_date(value)
+            record[name] = value
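parse_record hops from each row title to the <td> that follows it, narrowing the search window as it goes. A reduced sketch of the same BeautifulSoup pattern on made-up markup (only 'Описание' is visible in this hunk's ROW_TITLES; the two titles below are assumed for illustration):

    import re
    from bs4 import BeautifulSoup

    HTML = """
    <body><table>
    <tr><td>Частота</td><td>433.500 МГц</td></tr>
    <tr><td>Дата</td><td>01.05.2020</td></tr>
    </table></body>
    """

    soup = BeautifulSoup(HTML, features='html.parser').find('body')
    record = {}
    for name, title in (('frequency', 'Частота'), ('date', 'Дата')):
        # Jump to the text node matching the row title, then to the
        # value cell that follows it; later searches start from there.
        soup = soup.find_next(string=re.compile(title)).find_next('td')
        record[name] = soup.get_text(strip=True)

    print(record)  # {'frequency': '433.500 МГц', 'date': '01.05.2020'}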
+            record[name] = value

-        record[name] = value
-
-    logger.debug('Parsed record: %s', record)
-    return record
+        logger.debug('Parsed record: %s', record)
+        return record
diff --git a/freq_table/utils.py b/freq_table/utils.py
index 05b8119..8a795f7 100644
--- a/freq_table/utils.py
+++ b/freq_table/utils.py
@@ -1,7 +1,6 @@
 import re
 from datetime import date

-import yaml
 from pyphen import Pyphen

 HYPHEN = '\u00ad'  # soft hyphen
@@ -23,11 +22,6 @@
 }

 pyphen = Pyphen(lang='ru_RU')
-config = None  # set by main
-
-
-def get_config(name):
-    return config[name]


 def parse_date(date_str):
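utils.hyphenate itself is outside this hunk, but it presumably builds on this Pyphen instance: inserting soft hyphens lets the browser break long Russian words inside the narrow table cells. A standalone sketch of the underlying pyphen call:

    from pyphen import Pyphen

    SOFT_HYPHEN = '\u00ad'
    pyphen = Pyphen(lang='ru_RU')

    # Pyphen.inserted() returns the word with the given hyphen string at
    # every legal break point; soft hyphens stay invisible until the
    # renderer actually needs to wrap the word.
    print(pyphen.inserted('радиостанция', hyphen=SOFT_HYPHEN))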