From 27f88c0e9773fb0c998b809d1a6f10d6f3123e65 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 12 Dec 2021 20:28:11 +0100 Subject: [PATCH 01/42] amazon source with locale specification for EN and DE --- beancount_import/source/amazon.py | 10 +- beancount_import/source/amazon_invoice.py | 1410 ++++++++++------- .../source/amazon_invoice_test.py | 3 +- 3 files changed, 844 insertions(+), 579 deletions(-) diff --git a/beancount_import/source/amazon.py b/beancount_import/source/amazon.py index 484ae779..d7d22f5c 100644 --- a/beancount_import/source/amazon.py +++ b/beancount_import/source/amazon.py @@ -41,6 +41,7 @@ 'Gift Card Amount': 'Assets:Gift-Cards:Amazon', 'Rewards Points': 'Income:Amazon:Cashback', }, + locale='EN' # optional, defaults to 'EN' ) The `amazon_account` key must be specified, and should be set to the email @@ -54,6 +55,9 @@ specify these keys in the configuration, the generic automatic account prediction will likely handle them. +The `locale` sets country/language specific settings. +Currently, `EN` and `DE` are available. + Specifying credit cards ======================= @@ -271,7 +275,7 @@ from beancount.core.number import ZERO, ONE import beancount.core.amount -from .amazon_invoice import parse_invoice, DigitalItem, Order +from .amazon_invoice import AmazonInvoice, DigitalItem, Order from ..matching import FIXME_ACCOUNT, SimpleInventory from ..posting_date import POSTING_DATE_KEY, POSTING_TRANSACTION_DATE_KEY @@ -539,6 +543,7 @@ def __init__(self, posttax_adjustment_accounts: Dict[str, str] = {}, pickle_dir: str = None, earliest_date: datetime.date = None, + locale='EN', **kwargs) -> None: super().__init__(**kwargs) self.directory = directory @@ -551,6 +556,7 @@ def __init__(self, self.pickler = AmazonPickler(pickle_dir) self.earliest_date = earliest_date + self.amz_inv = AmazonInvoice(locale=locale) self.invoice_filenames = [] # type: List[Tuple[str, str]] for filename in os.listdir(self.directory): @@ -570,7 +576,7 @@ def _get_invoice(self, results: SourceResults, order_id: str, invoice_filename: invoice = self.pickler.load(results, invoice_path) # type: Optional[Order] if invoice is None: self.log_status('amazon: processing %s: %s' % (order_id, invoice_path, )) - invoice = parse_invoice(invoice_path) + invoice = self.amz_inv.parse_invoice(invoice_path) self.pickler.dump( results, invoice_path, invoice ) self._cached_invoices[invoice_filename] = invoice, invoice_path diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 7ce2faa4..579a6a16 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -1,4 +1,4 @@ -"""Parses an Amazon.com regular or digital order details HTML file.""" +"""Parses an Amazon.com/.de regular or digital order details HTML file.""" from typing import NamedTuple, Optional, List, Union, Iterable, Dict, Sequence, cast import collections @@ -6,6 +6,7 @@ import os import functools import datetime +import logging import bs4 import dateutil.parser @@ -15,6 +16,193 @@ from ..amount_parsing import parse_amount, parse_number +logger = logging.getLogger('amazon') + + +class Locale_EN(): + LOCALE = 'EN' + tax_included_in_price = False + shipped_pattern = '^Shipped on ([^\\n]+)$' + nonshipped_headers = { + 'Service completed', + 'Preparing for Shipment', + 'Not Yet Shipped', + 'Shipping now' + } + items_ordered = 'Items Ordered' + price = 'Price' + currency = 'USD' + of = 'of:' + seller_profile = ' (seller profile)' + items_subtotal_regex = r'Item\(s\) Subtotal:' + total_before_tax_regex = 'Total Before Tax:' + sales_tax_shipment = 'Sales Tax:' + total_shipment = 'Total for This Shipment:' + + pattern_without_condition = r'(?P.*)\n\s*(?:Sold|Provided) by:? (?P[^\n]+)' + pattern_with_condition = pattern_without_condition + r'\n.*\n\s*Condition: (?P[^\n]+)' + + # Payment Table & Credit Card Transactions + grand_total_regex = r'\n\s*Grand Total:\s+(.*)\n' + credit_card_transactions = 'Credit Card transactions' + last_digits_regex = r'^([^:]+) ending in ([0-9]+):\s+([^:]+):$' + payment_type_regexes = [ + # only first matching regex is used! + r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Last (?:4 )?digits:\s+([0-9]{4})\n', + r'\n\s*(.+)\s+ending in\s+([0-9]{4})\n' + ] + payment_information = '^Payment information$' + grand_total = 'Grand Total:' + + # Page Header + order_placed_regex = r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})' + order_id_regular = r'.*Order ([0-9\-]+)' + + # digital invoice + order_cancelled = 'Order Canceled' + digital_order = 'Digital Order: (.*)' + by = 'By' + sold_by = r'Sold\s+By' + tax_collected_digital = 'Tax Collected:' + estimated_tax = 'Estimated tax to be collected:' + total_order_digital = 'Total for this Order:' + order_id_digital = '^Amazon.com\\s+order number:\\s+(D[0-9-]+)$' + payment_information_digital = 'Payment Information' + + pretax_adjustment_fields_pattern = ('(?:' + '|'.join([ + 'Shipping & Handling', # Verpackung & Versand: + 'Free Shipping', + 'Free delivery', + 'Pantry delivery', + 'Promotion(?:s| Applied)', # Gutschein eingelöst: + 'Lightning Deal', + 'Your Coupon Savings', + '[0-9]+% off savings', + 'Subscribe & Save', + '[0-9]+ Audible Credit Applied', + '.*[0-9]+% Off.*', + 'Courtesy Credit', + 'Extra Savings', + '(?:.*) Discount', + 'Gift[ -]Wrap', + ]) + ') *:') + posttax_adjustment_fields_pattern = r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X' + + @staticmethod + def parse_amount(amount, assumed_currency=None) -> Amount: + return parse_amount(amount, assumed_currency=assumed_currency) + + @staticmethod + def parse_date(date_str) -> str: + return dateutil.parser.parse(date_str).date() + + +class Locale_DE(): + """Language and region specific settings for parsing amazon.de invoices + """ + LOCALE = 'DE' + tax_included_in_price = True # no separate tax transactions + shipped_pattern = '^versandt am ([^\\n]+)$' + nonshipped_headers = { # Translations missing + 'Service completed', + 'Preparing for Shipment', + 'Not Yet Shipped', + 'Shipping now' + } + items_ordered = 'Bestellte Artikel' + price = 'Preis' + currency = 'EUR' + of = 'Exemplar(e) von:' + seller_profile = ' (Mitgliedsprofil)' + items_subtotal_regex = 'Zwischensumme:' + total_before_tax_regex = 'Summe ohne MwSt.:' + sales_tax_shipment = 'Anzurechnende MwSt.:' # not sure (only old invoices) + total_shipment = 'Gesamtsumme:' + + pattern_without_condition = r'(?P.*)\n\s*(?:Verkauf|Provided) durch:? (?P[^\n]+)' + # Provided by: Translation missing + pattern_with_condition = pattern_without_condition + r'\n.*\n\s*Zustand: (?P[^\n]+)' + + # Payment Table & Credit Card Transactions + grand_total_regex = r'\n\s*(?:Gesamtsumme|Endsumme):\s+(.*)\n' # regular: Gesamtsumme, digital: Endsumme + credit_card_transactions = 'Kreditkarten-Transaktionen' + last_digits_regex = r'^([^:]+) mit den Endziffern ([0-9]+):\s+([^:]+):$' + payment_type_regexes = [ + # only first matching regex is used! + r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Die letzten (?:4 )?Ziffern:\s+([0-9]{3,4})\n', # 3 digits for Bankeinzug + r'\n\s*(.+)\s+mit den Endziffern\s+([0-9]{4})\n' + ] + payment_information = '^Zahlungsdaten$' + grand_total = 'Gesamtsumme:' + + # Page Header + order_placed_regex = r'(?:Subscribe and Save )?Bestellung aufgegeben am:\s+(\d+\. [^\s]+ \d{4})' + # Translation missing: Subscribe and Save -> Sparabo?? + order_id_regular = r'.*Bestellung ([0-9\-]+)' + + # digital invoice + order_cancelled = 'Order Canceled' + digital_order = 'Digitale Bestellung: (.*)' + by = 'Von' + sold_by = r'Verkauft von' + tax_collected_digital = 'MwSt:' + estimated_tax = 'Anzurechnende MwSt.:' + total_order_digital = 'Endsumme:' + order_id_digital = '^Amazon.de\\s+Bestellnummer:\\s+(D[0-9-]+)$' + payment_information_digital = 'Zahlungsinformation' + + # most of translations still missing ... + pretax_adjustment_fields_pattern = ('(?:' + '|'.join([ + 'Verpackung & Versand', + 'Free Shipping', + 'Free delivery', + 'Pantry delivery', + 'Gutschein eingelöst', # english version not removed yet + 'Promotion(?:s| Applied)', + 'Lightning Deal', + 'Your Coupon Savings', + '[0-9]+% off savings', + 'Subscribe & Save', + '[0-9]+ Audible Credit Applied', + '.*[0-9]+% Off.*', + 'Courtesy Credit', + 'Extra Savings', + '(?:.*) Discount', + 'Gift[ -]Wrap', + ]) + ') *:') + posttax_adjustment_fields_pattern = r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X' + + + @staticmethod + def _format_number_str(value: str) -> str: + # 12.345,67 EUR -> 12345.67 EUR + thousands_sep = '.' + decimal_sep = ',' + return value.replace(thousands_sep, '').replace(decimal_sep, '.') + + @staticmethod + def parse_amount(amount: str, assumed_currency=None) -> Amount: + if amount is None: + return None + else: + return parse_amount( + Locale_DE._format_number_str(amount), + assumed_currency=assumed_currency) + + class _parserinfo(dateutil.parser.parserinfo): + MONTHS=[ + ('Jan', 'Januar'), ('Feb', 'Februar'), ('Mär', 'März'), + ('Apr', 'April'), ('Mai', 'Mai'), ('Jun', 'Juni'), + ('Jul', 'Juli'), ('Aug', 'August'), ('Sep', 'September'), + ('Okt', 'Oktober'), ('Nov', 'November'), ('Dez', 'Dezember') + ] + + @staticmethod + def parse_date(date_str) -> str: + return dateutil.parser.parse(date_str, parserinfo=Locale_DE._parserinfo(dayfirst=True)).date() + + +LOCALE = {x.LOCALE : x for x in [Locale_EN, Locale_DE]} Errors = List[str] Adjustment = NamedTuple('Adjustment', [ @@ -65,26 +253,6 @@ ('errors', Errors), ]) -pretax_adjustment_fields_pattern = ('(?:' + '|'.join([ - 'Shipping & Handling', - 'Free Shipping', - 'Free delivery', - 'Pantry delivery', - 'Promotion(?:s| Applied)', - 'Lightning Deal', - 'Your Coupon Savings', - '[0-9]+% off savings', - 'Subscribe & Save', - '[0-9]+ Audible Credit Applied', - '.*[0-9]+% Off.*', - 'Courtesy Credit', - 'Extra Savings', - '(?:.*) Discount', - 'Gift[ -]Wrap', -]) + ') *:') -posttax_adjustment_fields_pattern = r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X' - - def to_json(obj): if hasattr(obj, '_asdict'): return to_json(obj._asdict()) @@ -99,607 +267,692 @@ def to_json(obj): return obj -def add_amount(a: Optional[Amount], b: Optional[Amount]) -> Optional[Amount]: - if a is None: - return b - if b is None: - return a - return beancount.core.amount.add(a, b) - - -def reduce_amounts(amounts: Iterable[Amount]) -> Optional[Amount]: - return functools.reduce(add_amount, amounts, None) - - -def get_field_in_table(table, pattern, allow_multiple=False, - return_label=False): - def predicate(node): - return node.name == 'td' and re.fullmatch(pattern, node.text.strip(), - re.I) is not None +class AmazonInvoice(): + def __init__(self, locale='EN'): + self.locale = LOCALE[locale] + + @staticmethod + def add_amount(a: Optional[Amount], b: Optional[Amount]) -> Optional[Amount]: + """Add two amounts, amounts with value `None` are ignored. + """ + if a is None: + return b + if b is None: + return a + return beancount.core.amount.add(a, b) + + @staticmethod + def reduce_amounts(amounts: Iterable[Amount]) -> Optional[Amount]: + """Reduce iterable of amounts to sum by applying `add_amount`. + """ + return functools.reduce(AmazonInvoice.add_amount, amounts, None) + + @staticmethod + def get_field_in_table(table, pattern, allow_multiple=False, + return_label=False): + def predicate(node): + return node.name == 'td' and re.fullmatch(pattern, node.text.strip(), + re.I) is not None + + tds = table.find_all(predicate) + results = [(td.text.strip().strip(':'), + td.find_next_sibling('td').text.strip()) for td in tds] + if not return_label: + results = [r[1] for r in results] + if not allow_multiple: + if not results: + return None + return results[0] + return results - tds = table.find_all(predicate) - results = [(td.text.strip().strip(':'), - td.find_next_sibling('td').text.strip()) for td in tds] - if not return_label: - results = [r[1] for r in results] - if not allow_multiple: - if not results: - return None - return results[0] - return results + def get_adjustments_in_table(self, table, pattern, assumed_currency=None): + adjustments = [] + for label, amount_str in AmazonInvoice.get_field_in_table( + table, pattern, allow_multiple=True, return_label=True): + adjustments.append( + Adjustment(amount=self.locale.parse_amount(amount_str, assumed_currency), + description=label)) + return adjustments + @staticmethod + def reduce_adjustments(adjustments: List[Adjustment]) -> List[Adjustment]: + all_adjustments = collections.OrderedDict() # type: Dict[str, List[Amount]] + for adjustment in adjustments: + all_adjustments.setdefault(adjustment.description, + []).append(adjustment.amount) + return [ + Adjustment(k, AmazonInvoice.reduce_amounts(v)) for k, v in all_adjustments.items() + ] -def get_adjustments_in_table(table, pattern, assumed_currency=None): - adjustments = [] - for label, amount_str in get_field_in_table( - table, pattern, allow_multiple=True, return_label=True): - adjustments.append( - Adjustment(amount=parse_amount(amount_str, assumed_currency), - description=label)) - return adjustments + def parse_shipments(self, soup) -> List[Shipment]: + """ + Parses Shipment Table Part of HTML document (1st Table) + """ -def reduce_adjustments(adjustments: List[Adjustment]) -> List[Adjustment]: - all_adjustments = collections.OrderedDict() # type: Dict[str, List[Amount]] - for adjustment in adjustments: - all_adjustments.setdefault(adjustment.description, - []).append(adjustment.amount) - return [ - Adjustment(k, reduce_amounts(v)) for k, v in all_adjustments.items() - ] + # shipped_pattern = '^Shipped on ([^\\n]+)$' + # # versandt am 27. September 2021 + # # Shipped on February 8, 2016 + # nonshipped_headers = { + # 'Service completed', + # 'Preparing for Shipment', + # 'Not Yet Shipped', + # 'Shipping now' + # } + def is_shipment_header_table(node): + if node.name != 'table': + return False + text = node.text.strip() + m = re.match(self.locale.shipped_pattern, text) + return m is not None or text in self.locale.nonshipped_headers + + header_tables = soup.find_all(is_shipment_header_table) + + shipments = [] # type: List[Shipment] + errors = [] # type: Errors + + for header_table in header_tables: + text = header_table.text.strip() + shipped_date = None + if text not in self.locale.nonshipped_headers: + m = re.match(self.locale.shipped_pattern, text) + assert m is not None + shipped_date = self.locale.parse_date(m.group(1)) + + items = [] + + shipment_table = header_table.find_parent('table') + + def is_items_ordered_header(node): + if node.name != 'tr': + return False + tds = node('td') + if len(tds) < 2: + return False + return (tds[0].text.strip() == self.locale.items_ordered and + tds[1].text.strip() == self.locale.price) + # Items Ordered + # Bestellte Artikel + # Price + # Preis + + items_ordered_header = shipment_table.find(is_items_ordered_header) + + item_rows = items_ordered_header.find_next_siblings('tr') + + logger.info('Parsing Shipment Items') + for item_row in item_rows: + tds = item_row('td') + description_node = tds[0] + price_node = tds[1] + price = price_node.text.strip() + + if price is None: + price = Amount(D(0), self.locale.currency) + # EUR 16,99 + # $11.87 + else: + price = self.locale.parse_amount(price) + + # 1 of: 365 Everyday Value, Potato Yellow Bag Organic, 48 Ounce + # 2 (1.04 lb) of: Broccoli Crowns Conventional, 1 Each + # 2.07 lb of: Pork Sausage Link Italian Mild Step 1 + + pattern_quantity = r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+of:' + # ToDo: check if this matches all locales, e.g. 'of' and units + m = re.match(pattern_quantity, description_node.text, re.UNICODE|re.DOTALL) + quantity = 1 + if m is not None: + # Amazon will say you got, e.g. 2 broccoli crowns at $1.69/lb - but then this code multiplies the 2 by the price listed + # on the invoice, which is the total price in this case (but the per-unit price in other cases) - so if there's a quantity + # and a weight, ignore the quantity and treat it as 1 + # alternately, capture the weight and the per-unit price and multiply out + quantity = m.group("quantity") # ignore quantity for weight items + + if quantity is None: + #print("Unable to extract quantity, using 1: %s" % description_node.text) + quantity = D(1) + else: + quantity = D(quantity) + + text = description_node.text.split(self.locale.of, 1)[1] + # Übersetzung fehlt + + m = re.match(self.locale.pattern_with_condition, text, re.UNICODE | re.DOTALL) + if m is None: + m = re.match(self.locale.pattern_without_condition, text, re.UNICODE | re.DOTALL) + if m is None: + raise Exception("Could not extract item from row", text) + + description = re.sub(r'\s+', ' ', m.group('description').strip()) + sold_by = re.sub(r'\s+', ' ', m.group('sold_by').strip()) + try: + condition = re.sub(r'\s+', ' ', m.group('condition').strip()) + except IndexError: + condition = None + suffix = self.locale.seller_profile + if sold_by.endswith(suffix): + sold_by = sold_by[:-len(suffix)] + items.append( + Item( + quantity=quantity, + description=description, + sold_by=sold_by, + condition=condition, + price=price, + )) + + logger.info('Parsing Shipment Amounts') + items_subtotal = self.locale.parse_amount( + self.get_field_in_table(shipment_table, self.locale.items_subtotal_regex)) + + expected_items_subtotal = self.reduce_amounts( + beancount.core.amount.mul(x.price, D(x.quantity)) for x in items) + if (items_subtotal is not None and + expected_items_subtotal != items_subtotal): + errors.append( + 'expected items subtotal is %r, but parsed value is %r' % + (expected_items_subtotal, items_subtotal)) + + output_fields = dict() + output_fields['pretax_adjustments'] = self.get_adjustments_in_table( + shipment_table, self.locale.pretax_adjustment_fields_pattern) + output_fields['posttax_adjustments'] = self.get_adjustments_in_table( + shipment_table, self.locale.posttax_adjustment_fields_pattern) + pretax_parts = [items_subtotal or expected_items_subtotal] + [ + a.amount for a in output_fields['pretax_adjustments'] + ] + total_before_tax = self.locale.parse_amount( + self.get_field_in_table(shipment_table, self.locale.total_before_tax_regex)) + expected_total_before_tax = self.reduce_amounts(pretax_parts) + if total_before_tax is None: + total_before_tax = expected_total_before_tax + elif expected_total_before_tax != total_before_tax: + errors.append( + 'expected total before tax is %s, but parsed value is %s' % + (expected_total_before_tax, total_before_tax)) + + sales_tax = self.get_adjustments_in_table(shipment_table, self.locale.sales_tax_shipment) + # Sales Tax: + # Anzurechnende MwSt.: + + posttax_parts = ( + [total_before_tax] + [a.amount for a in sales_tax] + + [a.amount for a in output_fields['posttax_adjustments']]) + total = self.locale.parse_amount( + self.get_field_in_table(shipment_table, self.locale.total_shipment)) + # Total for This Shipment: + # Gesamtsumme: + expected_total = self.reduce_amounts(posttax_parts) + if total is None: + total = expected_total + elif expected_total != total: + errors.append('expected total is %s, but parsed value is %s' % + (expected_total, total)) + + shipments.append( + Shipment( + shipped_date=shipped_date, + items=items, + items_subtotal=items_subtotal, + total_before_tax=total_before_tax, + tax=sales_tax, + total=total, + errors=errors, + **output_fields)) + + return shipments + + + def parse_credit_card_transactions_from_payments_table( + self, + payment_table, + order_date: datetime.date) -> List[CreditCardTransaction]: + """ Parse payment information from payments table. + Only type and last digits are given, no amount (assuming grand total). + Other payment methods than credit card are possible: + - Direct Debit (DE: Bankeinzug) + """ + payment_text = '\n'.join(payment_table.strings) + m = re.search(self.locale.grand_total_regex, payment_text) + assert m is not None + grand_total = self.locale.parse_amount(m.group(1).strip()) -def parse_shipments(soup) -> List[Shipment]: + for regex in self.locale.payment_type_regexes: + m = re.search(regex, payment_text) + if m is not None: + break + + # m = re.search(self.locale.last_digits_regex1, payment_text) + # if m is None: + # m = re.search(self.locale.last_digits_regex2, payment_text) + + if m is not None: + credit_card_transactions = [ + CreditCardTransaction( + date=order_date, + amount=grand_total, + card_description=m.group(1).strip(), + card_ending_in=m.group(2).strip(), + ) + ] + else: + credit_card_transactions = [] + return credit_card_transactions + + + def parse_credit_card_transactions(self, soup) -> List[CreditCardTransaction]: + """ Parse Credit Card Transactions from bottom sub-table of payments table. + Transactions are listed with type, 4 digits, transaction date and amount. + """ + def is_header_node(node): + return node.name == 'td' and node.text.strip( + ) == self.locale.credit_card_transactions + + header_node = soup.find(is_header_node) + if header_node is None: + return [] + sibling = header_node.find_next_sibling('td') + rows = sibling.find_all('tr') + transactions = [] + for row in rows: + if not row.text.strip(): + continue + tds = row('td') + description = tds[0].text.strip() + amount_text = tds[1].text.strip() + m = re.match(self.locale.last_digits_regex, description, + re.UNICODE) + assert m is not None + transactions.append( + CreditCardTransaction( + date=self.locale.parse_date(m.group(3)), + card_description=m.group(1), + card_ending_in=m.group(2), + amount=self.locale.parse_amount(amount_text), + )) + return transactions + + + def parse_invoice(self, path: str) -> Optional[Order]: + """ 1st method to call, distinguish between regular and digital invoice. + """ + if os.path.basename(path).startswith('D'): + logger.info('identified as digital invoice') + return self.parse_digital_order_invoice(path) + logger.info('identified as regular invoice') + return self.parse_regular_order_invoice(path) + + + def parse_regular_order_invoice(self, path: str) -> Order: + errors = [] + with open(path, 'rb') as f: + soup = bs4.BeautifulSoup(f.read(), 'lxml') + logger.info('parsing shipments...') + shipments = self.parse_shipments(soup) + logger.info('finished parsing shipments') + logger.info('parsing payment table...') + payment_table_header = soup.find( + lambda node: node.name == 'table' and re.match( + self.locale.payment_information, node.text.strip())) + + payment_table = payment_table_header.find_parent('table') + + logger.debug('parsing pretax adjustments...') + output_fields = dict() + output_fields['pretax_adjustments'] = self.get_adjustments_in_table( + payment_table, self.locale.pretax_adjustment_fields_pattern) + payment_adjustments = collections.OrderedDict() # type: Dict[str, Amount] + + # older invoices put pre-tax amounts on a per-shipment basis + # new invoices only put pre-tax amounts on the overall payments section + # detect which this is + pretax_amount = self.reduce_amounts( + a.amount for a in output_fields['pretax_adjustments']) + shipments_pretax_amount = None + + if any(s.pretax_adjustments for s in shipments): + shipments_pretax_amount = self.reduce_amounts(a.amount + for shipment in shipments + for a in shipment.pretax_adjustments) + + if shipments_pretax_amount != pretax_amount: + errors.append( + 'expected total pretax adjustment to be %s, but parsed total is %s' + % (shipments_pretax_amount, pretax_amount)) + + + logger.debug('parsing posttax adjustments...') + payments_total_adjustments = [] + shipments_total_adjustments = [] + + # parse first to get an idea of the working currency + grand_total = self.locale.parse_amount( + self.get_field_in_table(payment_table, self.locale.grand_total)) + + def resolve_posttax_adjustments(): + payment_adjustments.update( + self.reduce_adjustments( + self.get_adjustments_in_table(payment_table, + self.locale.posttax_adjustment_fields_pattern, + assumed_currency=grand_total.currency))) + all_shipments_adjustments = collections.OrderedDict( + self.reduce_adjustments( + sum((x.posttax_adjustments for x in shipments), []))) + all_keys = collections.OrderedDict(payment_adjustments.items()) + all_keys.update(all_shipments_adjustments.items()) + + all_adjustments = collections.OrderedDict() # type: Dict[str, Amount] + for key in all_keys: + payment_amount = payment_adjustments.get(key) + shipments_amount = all_shipments_adjustments.get(key) + amount = payment_amount + if payment_amount is None and shipments_amount is not None: + # Amazon sometimes doesn't include adjustments in the Payments table + amount = shipments_amount + payments_total_adjustments.append(amount) + elif payment_amount is not None and shipments_amount is None: + # Amazon sometimes doesn't include these adjustments in the Shipment table + shipments_total_adjustments.append(amount) + elif payment_amount != shipments_amount: + errors.append( + 'expected total %r to be %s, but parsed total is %s' % + (key, shipments_amount, payment_amount)) + all_adjustments[key] = amount + return [Adjustment(k, v) for k, v in all_adjustments.items()] + + output_fields['posttax_adjustments'] = resolve_posttax_adjustments() + + logger.debug('consistency check taxes...') + tax = self.locale.parse_amount( + self.get_field_in_table(payment_table, self.locale.estimated_tax)) + + expected_tax = self.reduce_amounts( + a.amount for shipment in shipments for a in shipment.tax) + if expected_tax is None: + # tax not given on shipment level + if not self.locale.tax_included_in_price: + # add tax if not already included in item prices + shipments_total_adjustments.append(tax) + elif expected_tax != tax: + errors.append( + 'expected tax is %s, but parsed value is %s' % (expected_tax, tax)) + + if self.locale.tax_included_in_price: + # tax is already inlcuded in item prices + # do not add additional transaction for taxes + tax = None + + logger.debug('consistency check grand total...') + payments_total_adjustment = self.reduce_amounts(payments_total_adjustments) + shipments_total_adjustment = self.reduce_amounts(shipments_total_adjustments) + + expected_total = self.add_amount(shipments_total_adjustment, + self.reduce_amounts(x.total for x in shipments)) + + # if no shipments pre-tax section, then the expected total isn't accounting + # for the pre-tax adjustments yet since they are only in the grand total section + if shipments_pretax_amount is None: + expected_total = self.add_amount(expected_total, pretax_amount) + + adjusted_grand_total = self.add_amount(payments_total_adjustment, grand_total) + if expected_total != adjusted_grand_total: + errors.append('expected grand total is %s, but parsed value is %s' % + (expected_total, adjusted_grand_total)) + + logger.debug('parsing order placed date...') + def is_order_placed_node(node): + m = re.fullmatch(self.locale.order_placed_regex, node.text.strip()) + return m is not None + + node = soup.find(is_order_placed_node) + m = re.fullmatch(self.locale.order_placed_regex, node.text.strip()) + assert m is not None + order_date = self.locale.parse_date(m.group(1)) + + logger.debug('parsing credit card transactions...') + credit_card_transactions = self.parse_credit_card_transactions(soup) + if not credit_card_transactions: + logger.debug('no credit card transactions table given, falling back to payments table') + credit_card_transactions = self.parse_credit_card_transactions_from_payments_table( + payment_table, order_date) + + if credit_card_transactions: + total_payments = self.reduce_amounts( + x.amount for x in credit_card_transactions) + else: + logger.info('no payment transactions found, assumig grand total as total payment amount') + total_payments = grand_total + if total_payments != adjusted_grand_total: + errors.append('total payment amount is %s, but grand total is %s' % + (total_payments, adjusted_grand_total)) + + logger.debug('parsing order ID...') + title = soup.find('title').text.strip() + m = re.fullmatch(self.locale.order_id_regular, title.strip()) + assert m is not None - shipped_pattern = '^Shipped on ([^\\n]+)$' - nonshipped_headers = { - 'Service completed', - 'Preparing for Shipment', - 'Not Yet Shipped', - 'Shipping now' - } + logger.debug('...finished parsing invoice.') + + return Order( + order_date=order_date, + order_id=m.group(1), + shipments=shipments, + credit_card_transactions=credit_card_transactions, + tax=tax, + errors=sum((shipment.errors + for shipment in shipments), cast(Errors, [])) + errors, + **output_fields) + + @staticmethod + def get_text_lines(parent_node): + text_lines = [''] + for node in parent_node.children: + if isinstance(node, bs4.NavigableString): + text_lines[-1] += str(node) + elif node.name == 'br': + text_lines.append('') + else: + text_lines[-1] += node.text + return text_lines - def is_shipment_header_table(node): - if node.name != 'table': - return False - text = node.text.strip() - m = re.match(shipped_pattern, text) - return m is not None or text in nonshipped_headers - header_tables = soup.find_all(is_shipment_header_table) + def parse_digital_order_invoice(self, path: str) -> Optional[Order]: + errors = [] + with open(path, 'rb') as f: + soup = bs4.BeautifulSoup(f.read(), 'lxml') - shipments = [] # type: List[Shipment] - errors = [] # type: Errors + logger.debug('check if order has been cancelled...') + def is_cancelled_order(node): + return node.text.strip() == self.locale.order_cancelled - for header_table in header_tables: - text = header_table.text.strip() - shipped_date = None - if text not in nonshipped_headers: - m = re.match(shipped_pattern, text) - assert m is not None - shipped_date = dateutil.parser.parse(m.group(1)).date() + if soup.find(is_cancelled_order): + return None - items = [] + logger.debug('parsing header...') + def is_digital_order_row(node): + if node.name != 'tr': + return False + m = re.match(self.locale.digital_order, node.text.strip()) + if m is None: + return False + try: + self.locale.parse_date(m.group(1)) + return True + except: + return False - shipment_table = header_table.find_parent('table') + # Find Digital Order row + digital_order_header = soup.find(is_digital_order_row) + digital_order_table = digital_order_header.find_parent('table') + m = re.match(self.locale.digital_order, digital_order_header.text.strip()) + assert m is not None + order_date = self.locale.parse_date(m.group(1)) + logger.debug('parsing items...') def is_items_ordered_header(node): if node.name != 'tr': return False tds = node('td') if len(tds) < 2: return False - return (tds[0].text.strip() == 'Items Ordered' and - tds[1].text.strip() == 'Price') + return (tds[0].text.strip() == self.locale.items_ordered and + tds[1].text.strip() == self.locale.price) - items_ordered_header = shipment_table.find(is_items_ordered_header) + items_ordered_header = digital_order_table.find(is_items_ordered_header) item_rows = items_ordered_header.find_next_siblings('tr') + items = [] + + other_fields_td = None for item_row in item_rows: tds = item_row('td') + if len(tds) != 2: + other_fields_td = tds[0] + continue description_node = tds[0] price_node = tds[1] price = price_node.text.strip() - price = parse_amount(price) - if price is None: - price = Amount(D(0), 'USD') - - # 1 of: 365 Everyday Value, Potato Yellow Bag Organic, 48 Ounce - # 2 (1.04 lb) of: Broccoli Crowns Conventional, 1 Each - # 2.07 lb of: Pork Sausage Link Italian Mild Step 1 - - pattern_quantity = r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+of:' - m = re.match(pattern_quantity, description_node.text, re.UNICODE|re.DOTALL) - quantity = 1 - if m is not None: - # Amazon will say you got, e.g. 2 broccoli crowns at $1.69/lb - but then this code multiplies the 2 by the price listed - # on the invoice, which is the total price in this case (but the per-unit price in other cases) - so if there's a quantity - # and a weight, ignore the quantity and treat it as 1 - # alternately, capture the weight and the per-unit price and multiply out - quantity = m.group("quantity") # ignore quantity for weight items - - if quantity is None: - #print("Unable to extract quantity, using 1: %s" % description_node.text) - quantity = D(1) + a = description_node.find('a') + if a is not None: + description = a.text.strip() + url = a['href'] else: - quantity = D(quantity) + bold_node = description_node.find('b') + description = bold_node.text.strip() + url = None - text = description_node.text.split("of:",1)[1] + text_lines = self.get_text_lines(description_node) - pattern_without_condition = r'(?P.*)\n\s*(?:Sold|Provided) by:? (?P[^\n]+)' - pattern_with_condition = pattern_without_condition + r'\n.*\n\s*Condition: (?P[^\n]+)' + def get_label_value(label): + for line in text_lines: + m = re.match(r'^\s*' + label + ': (.*)$', line, + re.UNICODE | re.DOTALL) + if m is None: + continue + return m.group(1) + + by = get_label_value(self.locale.by) + sold_by = get_label_value(self.locale.sold_by) - m = re.match(pattern_with_condition, text, re.UNICODE | re.DOTALL) - if m is None: - m = re.match(pattern_without_condition, text, re.UNICODE | re.DOTALL) - if m is None: - raise Exception("Could not extract item from row", text) - - description = re.sub(r'\s+', ' ', m.group('description').strip()) - sold_by = re.sub(r'\s+', ' ', m.group('sold_by').strip()) - try: - condition = re.sub(r'\s+', ' ', m.group('condition').strip()) - except IndexError: - condition = None - suffix = ' (seller profile)' - if sold_by.endswith(suffix): - sold_by = sold_by[:-len(suffix)] items.append( - Item( - quantity=quantity, + DigitalItem( description=description, + by=by, sold_by=sold_by, - condition=condition, - price=price, + url=url, + price=self.locale.parse_amount(price), )) - items_subtotal = parse_amount( - get_field_in_table(shipment_table, r'Item\(s\) Subtotal:')) - expected_items_subtotal = reduce_amounts( - beancount.core.amount.mul(x.price, D(x.quantity)) for x in items) - if (items_subtotal is not None and - expected_items_subtotal != items_subtotal): - errors.append( - 'expected items subtotal is %r, but parsed value is %r' % - (expected_items_subtotal, items_subtotal)) - + other_fields_text_lines = self.get_text_lines(other_fields_td) + + logger.debug('parsing amounts...') + def get_other_field(pattern, allow_multiple=False, return_label=False): + results = [] + for line in other_fields_text_lines: + r = r'^\s*(' + pattern + r')\s+(.*[^\s])\s*$' + m = re.match(r, line, re.UNICODE) + if m is not None: + results.append((m.group(1).strip(':'), m.group(2))) + if not return_label: + results = [r[1] for r in results] + if not allow_multiple: + if not results: + return None + return results[0] + return results + + def get_adjustments(pattern): + adjustments = [] + for label, amount_str in get_other_field( + pattern, allow_multiple=True, return_label=True): + adjustments.append( + Adjustment(amount=self.locale.parse_amount(amount_str), description=label)) + return adjustments + + def get_amounts_in_text(pattern_map): + amounts = dict() + for key, label in pattern_map.items(): + amount = self.locale.parse_amount(get_other_field(label)) + amounts[key] = amount + return amounts + + items_subtotal = self.locale.parse_amount( + get_other_field(self.locale.items_subtotal_regex)) + total_before_tax = self.locale.parse_amount( + get_other_field(self.locale.total_before_tax_regex)) + tax = get_adjustments(self.locale.tax_collected_digital) + total_for_this_order = self.locale.parse_amount( + get_other_field(self.locale.total_order_digital)) + + logger.debug('parsing pretax adjustments...') output_fields = dict() - output_fields['pretax_adjustments'] = get_adjustments_in_table( - shipment_table, pretax_adjustment_fields_pattern) - output_fields['posttax_adjustments'] = get_adjustments_in_table( - shipment_table, posttax_adjustment_fields_pattern) - pretax_parts = [items_subtotal or expected_items_subtotal] + [ - a.amount for a in output_fields['pretax_adjustments'] - ] - total_before_tax = parse_amount( - get_field_in_table(shipment_table, 'Total before tax:')) - expected_total_before_tax = reduce_amounts(pretax_parts) - if total_before_tax is None: - total_before_tax = expected_total_before_tax - elif expected_total_before_tax != total_before_tax: - errors.append( - 'expected total before tax is %s, but parsed value is %s' % - (expected_total_before_tax, total_before_tax)) - - sales_tax = get_adjustments_in_table(shipment_table, 'Sales Tax:') - - posttax_parts = ( - [total_before_tax] + [a.amount for a in sales_tax] + - [a.amount for a in output_fields['posttax_adjustments']]) - total = parse_amount( - get_field_in_table(shipment_table, 'Total for This Shipment:')) - expected_total = reduce_amounts(posttax_parts) - if total is None: - total = expected_total - elif expected_total != total: + output_fields['pretax_adjustments'] = get_adjustments( + self.locale.pretax_adjustment_fields_pattern) + pretax_parts = ([items_subtotal] + + [a.amount for a in output_fields['pretax_adjustments']]) + logger.debug(pretax_parts) + logger.debug(total_before_tax) + expected_total_before_tax = self.reduce_amounts(pretax_parts) + if expected_total_before_tax != total_before_tax: + errors.append('expected total before tax is %s, but parsed value is %s' + % (expected_total_before_tax, total_before_tax)) + + logger.debug('parsing posttax adjustments...') + output_fields['posttax_adjustments'] = get_adjustments( + self.locale.posttax_adjustment_fields_pattern) + posttax_parts = ([total_before_tax] + [a.amount for a in tax] + + [a.amount for a in output_fields['posttax_adjustments']]) + expected_total = self.reduce_amounts(posttax_parts) + + logger.debug(total_for_this_order) + if expected_total != total_for_this_order: errors.append('expected total is %s, but parsed value is %s' % - (expected_total, total)) - - shipments.append( - Shipment( - shipped_date=shipped_date, - items=items, - items_subtotal=items_subtotal, - total_before_tax=total_before_tax, - tax=sales_tax, - total=total, - errors=errors, - **output_fields)) - - return shipments - - -def parse_credit_card_transactions_from_payments_table( - payment_table, - order_date: datetime.date) -> List[CreditCardTransaction]: - payment_text = '\n'.join(payment_table.strings) - m = re.search(r'\n\s*Grand Total:\s+(.*)\n', payment_text) - assert m is not None - grand_total = parse_amount(m.group(1).strip()) - - m = re.search( - r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Last (?:4 )?digits:\s+([0-9]{4})\n', - payment_text) - if m is None: - m = re.search(r'\n\s*(.+)\s+ending in\s+([0-9]{4})\n', payment_text) - - if m is not None: - credit_card_transactions = [ - CreditCardTransaction( - date=order_date, - amount=grand_total, - card_description=m.group(1).strip(), - card_ending_in=m.group(2).strip(), + (expected_total, total_for_this_order)) + + if self.locale.tax_included_in_price: + tax = [] + + shipment = Shipment( + shipped_date=order_date, + items=items, + items_subtotal=items_subtotal, + total_before_tax=total_before_tax, + tax=tax, + total=total_for_this_order, + errors=errors, + **output_fields) + + order_id_td = soup.find( + lambda node: node.name == 'td' and + re.match(self.locale.order_id_digital, node.text.strip()) ) - ] - else: - credit_card_transactions = [] - return credit_card_transactions - - -def parse_credit_card_transactions(soup) -> List[CreditCardTransaction]: - def is_header_node(node): - return node.name == 'td' and node.text.strip( - ) == 'Credit Card transactions' - - header_node = soup.find(is_header_node) - if header_node is None: - return [] - sibling = header_node.find_next_sibling('td') - rows = sibling.find_all('tr') - transactions = [] - for row in rows: - if not row.text.strip(): - continue - tds = row('td') - description = tds[0].text.strip() - amount_text = tds[1].text.strip() - m = re.match(r'^([^:]+) ending in ([0-9]+):\s+([^:]+):$', description, - re.UNICODE) + m = re.match(self.locale.order_id_digital, order_id_td.text.strip()) assert m is not None - transactions.append( - CreditCardTransaction( - date=dateutil.parser.parse(m.group(3)).date(), - card_description=m.group(1), - card_ending_in=m.group(2), - amount=parse_amount(amount_text), - )) - return transactions - - -def parse_invoice(path: str) -> Optional[Order]: - if os.path.basename(path).startswith('D'): - return parse_digital_order_invoice(path) - return parse_regular_order_invoice(path) - - -def parse_regular_order_invoice(path: str) -> Order: - errors = [] - with open(path, 'rb') as f: - soup = bs4.BeautifulSoup(f.read(), 'lxml') - shipments = parse_shipments(soup) - payment_table_header = soup.find( - lambda node: node.name == 'table' and re.match('^Payment information$', node.text.strip())) - - payment_table = payment_table_header.find_parent('table') - - output_fields = dict() - output_fields['pretax_adjustments'] = get_adjustments_in_table( - payment_table, pretax_adjustment_fields_pattern) - payment_adjustments = collections.OrderedDict() # type: Dict[str, Amount] - - # older invoices put pre-tax amounts on a per-shipment basis - # new invoices only put pre-tax amounts on the overall payments section - # detect which this is - pretax_amount = reduce_amounts( - a.amount for a in output_fields['pretax_adjustments']) - shipments_pretax_amount = None - - if any(s.pretax_adjustments for s in shipments): - shipments_pretax_amount = reduce_amounts(a.amount - for shipment in shipments - for a in shipment.pretax_adjustments) - - if shipments_pretax_amount != pretax_amount: - errors.append( - 'expected total pretax adjustment to be %s, but parsed total is %s' - % (shipments_pretax_amount, pretax_amount)) - - payments_total_adjustments = [] - shipments_total_adjustments = [] - - # parse first to get an idea of the working currency - grand_total = parse_amount( - get_field_in_table(payment_table, 'Grand Total:')) - - def resolve_posttax_adjustments(): - payment_adjustments.update( - reduce_adjustments( - get_adjustments_in_table(payment_table, - posttax_adjustment_fields_pattern, - assumed_currency=grand_total.currency))) - all_shipments_adjustments = collections.OrderedDict( - reduce_adjustments( - sum((x.posttax_adjustments for x in shipments), []))) - all_keys = collections.OrderedDict(payment_adjustments.items()) - all_keys.update(all_shipments_adjustments.items()) - - all_adjustments = collections.OrderedDict() # type: Dict[str, Amount] - for key in all_keys: - payment_amount = payment_adjustments.get(key) - shipments_amount = all_shipments_adjustments.get(key) - amount = payment_amount - if payment_amount is None and shipments_amount is not None: - # Amazon sometimes doesn't include adjustments in the Payments table - amount = shipments_amount - payments_total_adjustments.append(amount) - elif payment_amount is not None and shipments_amount is None: - # Amazon sometimes doesn't include these adjustments in the Shipment table - shipments_total_adjustments.append(amount) - elif payment_amount != shipments_amount: - errors.append( - 'expected total %r to be %s, but parsed total is %s' % - (key, shipments_amount, payment_amount)) - all_adjustments[key] = amount - return [Adjustment(k, v) for k, v in all_adjustments.items()] - - output_fields['posttax_adjustments'] = resolve_posttax_adjustments() - - tax = parse_amount( - get_field_in_table(payment_table, 'Estimated tax to be collected:')) - - expected_tax = reduce_amounts( - a.amount for shipment in shipments for a in shipment.tax) - if expected_tax is None: - shipments_total_adjustments.append(tax) - elif expected_tax != tax: - errors.append( - 'expected tax is %s, but parsed value is %s' % (expected_tax, tax)) - - payments_total_adjustment = reduce_amounts(payments_total_adjustments) - shipments_total_adjustment = reduce_amounts(shipments_total_adjustments) - - expected_total = add_amount(shipments_total_adjustment, - reduce_amounts(x.total for x in shipments)) - - # if no shipments pre-tax section, then the expected total isn't accounting - # for the pre-tax adjustments yet since they are only in the grand total section - if shipments_pretax_amount is None: - expected_total = add_amount(expected_total, pretax_amount) - - adjusted_grand_total = add_amount(payments_total_adjustment, grand_total) - if expected_total != adjusted_grand_total: - errors.append('expected grand total is %s, but parsed value is %s' % - (expected_total, adjusted_grand_total)) - order_placed_pattern = r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})' - - def is_order_placed_node(node): - m = re.fullmatch(order_placed_pattern, node.text.strip()) - return m is not None - - node = soup.find(is_order_placed_node) - m = re.fullmatch(order_placed_pattern, node.text.strip()) - assert m is not None - order_date = dateutil.parser.parse(m.group(1)).date() - - credit_card_transactions = parse_credit_card_transactions(soup) - if not credit_card_transactions: - credit_card_transactions = parse_credit_card_transactions_from_payments_table( - payment_table, order_date) - - if credit_card_transactions: - total_payments = reduce_amounts( - x.amount for x in credit_card_transactions) - else: - total_payments = Amount(number=ZERO, currency=grand_total.currency) - if total_payments != adjusted_grand_total: - errors.append('total payment amount is %s, but grand total is %s' % - (total_payments, adjusted_grand_total)) - - title = soup.find('title').text.strip() - m = re.fullmatch(r'.*Order ([0-9\-]+)', title.strip()) - assert m is not None - - return Order( - order_date=order_date, - order_id=m.group(1), - shipments=shipments, - credit_card_transactions=credit_card_transactions, - tax=tax, - errors=sum((shipment.errors - for shipment in shipments), cast(Errors, [])) + errors, - **output_fields) - - -def get_text_lines(parent_node): - text_lines = [''] - for node in parent_node.children: - if isinstance(node, bs4.NavigableString): - text_lines[-1] += str(node) - elif node.name == 'br': - text_lines.append('') - else: - text_lines[-1] += node.text - return text_lines - - -def parse_digital_order_invoice(path: str) -> Optional[Order]: - errors = [] - with open(path, 'rb') as f: - soup = bs4.BeautifulSoup(f.read(), 'lxml') - - def is_cancelled_order(node): - return node.text.strip() == 'Order Canceled' - - if soup.find(is_cancelled_order): - return None - - digital_order_pattern = 'Digital Order: (.*)' - - def is_digital_order_row(node): - if node.name != 'tr': - return False - m = re.match(digital_order_pattern, node.text.strip()) - if m is None: - return False - try: - dateutil.parser.parse(m.group(1)) - return True - except: - return False - - # Find Digital Order row - digital_order_header = soup.find(is_digital_order_row) - digital_order_table = digital_order_header.find_parent('table') - m = re.match(digital_order_pattern, digital_order_header.text.strip()) - assert m is not None - order_date = dateutil.parser.parse(m.group(1)).date() - - def is_items_ordered_header(node): - if node.name != 'tr': - return False - tds = node('td') - if len(tds) < 2: - return False - return (tds[0].text.strip() == 'Items Ordered' and - tds[1].text.strip() == 'Price') - - items_ordered_header = digital_order_table.find(is_items_ordered_header) - - item_rows = items_ordered_header.find_next_siblings('tr') - items = [] - - other_fields_td = None - - for item_row in item_rows: - tds = item_row('td') - if len(tds) != 2: - other_fields_td = tds[0] - continue - description_node = tds[0] - price_node = tds[1] - price = price_node.text.strip() - - a = description_node.find('a') - if a is not None: - description = a.text.strip() - url = a['href'] - else: - bold_node = description_node.find('b') - description = bold_node.text.strip() - url = None + order_id = m.group(1) - text_lines = get_text_lines(description_node) - - def get_label_value(label): - for line in text_lines: - m = re.match(r'^\s*' + label + ': (.*)$', line, - re.UNICODE | re.DOTALL) - if m is None: - continue - return m.group(1) - - by = get_label_value('By') - sold_by = get_label_value(r'Sold\s+By') - - items.append( - DigitalItem( - description=description, - by=by, - sold_by=sold_by, - url=url, - price=parse_amount(price), - )) - - other_fields_text_lines = get_text_lines(other_fields_td) - - def get_other_field(pattern, allow_multiple=False, return_label=False): - results = [] - for line in other_fields_text_lines: - r = r'^\s*(' + pattern + r')\s+(.*[^\s])\s*$' - m = re.match(r, line, re.UNICODE) - if m is not None: - results.append((m.group(1).strip(':'), m.group(2))) - if not return_label: - results = [r[1] for r in results] - if not allow_multiple: - if not results: - return None - return results[0] - return results + logger.debug('parsing payment information...') + payment_table = soup.find( + lambda node: node.name == 'table' and + node.text.strip().startswith(self.locale.payment_information_digital) + ) + credit_card_transactions = self.parse_credit_card_transactions_from_payments_table( + payment_table, order_date) - def get_adjustments(pattern): - adjustments = [] - for label, amount_str in get_other_field( - pattern, allow_multiple=True, return_label=True): - adjustments.append( - Adjustment(amount=parse_amount(amount_str), description=label)) - return adjustments + logger.debug('...finished') - def get_amounts_in_text(pattern_map): - amounts = dict() - for key, label in pattern_map.items(): - amount = parse_amount(get_other_field(label)) - amounts[key] = amount - return amounts - - items_subtotal = parse_amount(get_other_field(r'Item\(s\) Subtotal:')) - total_before_tax = parse_amount(get_other_field('Total Before Tax:')) - tax = get_adjustments('Tax Collected:') - total_for_this_order = parse_amount( - get_other_field('Total for this Order:')) - output_fields = dict() - output_fields['pretax_adjustments'] = get_adjustments( - pretax_adjustment_fields_pattern) - pretax_parts = ([items_subtotal] + - [a.amount for a in output_fields['pretax_adjustments']]) - expected_total_before_tax = reduce_amounts(pretax_parts) - if expected_total_before_tax != total_before_tax: - errors.append('expected total before tax is %s, but parsed value is %s' - % (expected_total_before_tax, total_before_tax)) - output_fields['posttax_adjustments'] = get_adjustments( - posttax_adjustment_fields_pattern) - posttax_parts = ([total_before_tax] + [a.amount for a in tax] + - [a.amount for a in output_fields['posttax_adjustments']]) - expected_total = reduce_amounts(posttax_parts) - if expected_total != total_for_this_order: - errors.append('expected total is %s, but parsed value is %s' % - (expected_total, total_for_this_order)) - - shipment = Shipment( - shipped_date=order_date, - items=items, - items_subtotal=items_subtotal, - total_before_tax=total_before_tax, - tax=tax, - total=total_for_this_order, - errors=errors, - **output_fields) - - order_id_pattern = '^Amazon.com\\s+order number:\\s+(D[0-9-]+)$' - - order_id_td = soup.find(lambda node: node.name == 'td' and re.match(order_id_pattern, node.text.strip())) - m = re.match(order_id_pattern, order_id_td.text.strip()) - assert m is not None - order_id = m.group(1) - - payment_table = soup.find( - lambda node: node.name == 'table' and node.text.strip().startswith('Payment Information') - ) - credit_card_transactions = parse_credit_card_transactions_from_payments_table( - payment_table, order_date) - - return Order( - order_date=order_date, - order_id=order_id, - shipments=[shipment], - credit_card_transactions=credit_card_transactions, - pretax_adjustments=[], - posttax_adjustments=output_fields['posttax_adjustments'], - tax=[], - errors=[]) + return Order( + order_date=order_date, + order_id=order_id, + shipments=[shipment], + credit_card_transactions=credit_card_transactions, + pretax_adjustments=[], + posttax_adjustments=output_fields['posttax_adjustments'], + tax=None, + errors=[]) def main(): @@ -713,13 +966,18 @@ def main(): default=False, action='store_true', help='Output in JSON format.') + # ToDo: add locale argument + # ap.add_argument( + # '--locale', default='EN', help='Local Amazon settings, defaults to EN') ap.add_argument('paths', nargs='*') + args = ap.parse_args() + amz_inv = AmazonInvoice() results = [] for path in args.paths: try: - result = parse_invoice(path) + result = amz_inv.parse_invoice(path) results.append(result) except: sys.stderr.write('Error reading: %s\n' % path) diff --git a/beancount_import/source/amazon_invoice_test.py b/beancount_import/source/amazon_invoice_test.py index bb2ac06f..6eb8e382 100644 --- a/beancount_import/source/amazon_invoice_test.py +++ b/beancount_import/source/amazon_invoice_test.py @@ -19,7 +19,8 @@ ]) def test_parsing(name: str): source_path = os.path.join(testdata_dir, name + '.html') - invoice = amazon_invoice.parse_invoice(source_path) + amz_inv = amazon_invoice.AmazonInvoice() + invoice = amz_inv.parse_invoice(source_path) json_path = os.path.join(testdata_dir, name + '.json') expected = json.load( open(json_path, 'r'), object_pairs_hook=collections.OrderedDict) From 75097fdf28a480b876af334ed6bfae7fc3db07d4 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 12 Dec 2021 20:53:54 +0100 Subject: [PATCH 02/42] update test reference output --- testdata/source/amazon/D56-5204779-4181560.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testdata/source/amazon/D56-5204779-4181560.json b/testdata/source/amazon/D56-5204779-4181560.json index 35b62d78..09aac4e8 100644 --- a/testdata/source/amazon/D56-5204779-4181560.json +++ b/testdata/source/amazon/D56-5204779-4181560.json @@ -54,7 +54,7 @@ } ], "pretax_adjustments": [], - "tax": [], + "tax": null, "posttax_adjustments": [], "errors": [] } From 7570a8b8e3b10459cbe2a819d08e65eb74cf16a5 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sat, 18 Dec 2021 14:37:00 +0100 Subject: [PATCH 03/42] addressed some PR comments, polishing --- beancount_import/source/amazon.py | 11 +- beancount_import/source/amazon_invoice.py | 1627 +++++++++-------- .../source/amazon_invoice_test.py | 3 +- 3 files changed, 835 insertions(+), 806 deletions(-) diff --git a/beancount_import/source/amazon.py b/beancount_import/source/amazon.py index d7d22f5c..e7a1b08b 100644 --- a/beancount_import/source/amazon.py +++ b/beancount_import/source/amazon.py @@ -268,6 +268,7 @@ import os import sys import pickle +import logging from beancount.core.data import Transaction, Posting, Balance, Commodity, Price, EMPTY_SET, Directive from beancount.core.amount import Amount @@ -275,7 +276,7 @@ from beancount.core.number import ZERO, ONE import beancount.core.amount -from .amazon_invoice import AmazonInvoice, DigitalItem, Order +from .amazon_invoice import LOCALES, parse_invoice, DigitalItem, Order from ..matching import FIXME_ACCOUNT, SimpleInventory from ..posting_date import POSTING_DATE_KEY, POSTING_TRANSACTION_DATE_KEY @@ -284,6 +285,8 @@ import datetime +logger = logging.getLogger('amazon') + ITEM_DESCRIPTION_KEY = 'amazon_item_description' ITEM_URL_KEY = 'amazon_item_url' ITEM_BY_KEY = 'amazon_item_by' @@ -543,7 +546,7 @@ def __init__(self, posttax_adjustment_accounts: Dict[str, str] = {}, pickle_dir: str = None, earliest_date: datetime.date = None, - locale='EN', + locale='en_EN', **kwargs) -> None: super().__init__(**kwargs) self.directory = directory @@ -556,7 +559,7 @@ def __init__(self, self.pickler = AmazonPickler(pickle_dir) self.earliest_date = earliest_date - self.amz_inv = AmazonInvoice(locale=locale) + self.locale = LOCALES[locale]() self.invoice_filenames = [] # type: List[Tuple[str, str]] for filename in os.listdir(self.directory): @@ -576,7 +579,7 @@ def _get_invoice(self, results: SourceResults, order_id: str, invoice_filename: invoice = self.pickler.load(results, invoice_path) # type: Optional[Order] if invoice is None: self.log_status('amazon: processing %s: %s' % (order_id, invoice_path, )) - invoice = self.amz_inv.parse_invoice(invoice_path) + invoice = parse_invoice(invoice_path, locale=self.locale) self.pickler.dump( results, invoice_path, invoice ) self._cached_invoices[invoice_filename] = invoice, invoice_path diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 579a6a16..930ca243 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -1,6 +1,7 @@ """Parses an Amazon.com/.de regular or digital order details HTML file.""" -from typing import NamedTuple, Optional, List, Union, Iterable, Dict, Sequence, cast +from typing import NamedTuple, Optional, List, Union, Iterable, Dict, Sequence, cast, Any +import dataclasses import collections import re import os @@ -16,162 +17,218 @@ from ..amount_parsing import parse_amount, parse_number -logger = logging.getLogger('amazon') - - -class Locale_EN(): - LOCALE = 'EN' - tax_included_in_price = False - shipped_pattern = '^Shipped on ([^\\n]+)$' - nonshipped_headers = { - 'Service completed', - 'Preparing for Shipment', - 'Not Yet Shipped', - 'Shipping now' - } - items_ordered = 'Items Ordered' - price = 'Price' - currency = 'USD' - of = 'of:' - seller_profile = ' (seller profile)' - items_subtotal_regex = r'Item\(s\) Subtotal:' - total_before_tax_regex = 'Total Before Tax:' - sales_tax_shipment = 'Sales Tax:' - total_shipment = 'Total for This Shipment:' - - pattern_without_condition = r'(?P.*)\n\s*(?:Sold|Provided) by:? (?P[^\n]+)' - pattern_with_condition = pattern_without_condition + r'\n.*\n\s*Condition: (?P[^\n]+)' +logger = logging.getLogger('amazon_invoice') + + +@dataclasses.dataclass +class Locale_Data(): + LOCALE: str + tax_included_in_price: bool + + # common fields regular and digital orders + items_ordered: str + price: str + currency: str + items_subtotal: str + total_before_tax: str + pretax_adjustment_fields_pattern: str + posttax_adjustment_fields_pattern: str # Payment Table & Credit Card Transactions - grand_total_regex = r'\n\s*Grand Total:\s+(.*)\n' - credit_card_transactions = 'Credit Card transactions' - last_digits_regex = r'^([^:]+) ending in ([0-9]+):\s+([^:]+):$' - payment_type_regexes = [ - # only first matching regex is used! - r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Last (?:4 )?digits:\s+([0-9]{4})\n', - r'\n\s*(.+)\s+ending in\s+([0-9]{4})\n' - ] - payment_information = '^Payment information$' - grand_total = 'Grand Total:' - - # Page Header - order_placed_regex = r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})' - order_id_regular = r'.*Order ([0-9\-]+)' - - # digital invoice - order_cancelled = 'Order Canceled' - digital_order = 'Digital Order: (.*)' - by = 'By' - sold_by = r'Sold\s+By' - tax_collected_digital = 'Tax Collected:' - estimated_tax = 'Estimated tax to be collected:' - total_order_digital = 'Total for this Order:' - order_id_digital = '^Amazon.com\\s+order number:\\s+(D[0-9-]+)$' - payment_information_digital = 'Payment Information' - - pretax_adjustment_fields_pattern = ('(?:' + '|'.join([ - 'Shipping & Handling', # Verpackung & Versand: - 'Free Shipping', - 'Free delivery', - 'Pantry delivery', - 'Promotion(?:s| Applied)', # Gutschein eingelöst: - 'Lightning Deal', - 'Your Coupon Savings', - '[0-9]+% off savings', - 'Subscribe & Save', - '[0-9]+ Audible Credit Applied', - '.*[0-9]+% Off.*', - 'Courtesy Credit', - 'Extra Savings', - '(?:.*) Discount', - 'Gift[ -]Wrap', - ]) + ') *:') - posttax_adjustment_fields_pattern = r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X' + grand_total: str + credit_card_transactions: str + credit_card_last_digits: str + payment_type: List[str] + payment_information: str + + # regular orders only + shipment_shipped_pattern: str + shipment_nonshipped_headers: List[str] + shipment_quantity: str + shipment_of: str + shipment_sales_tax: str + shipment_total: str + shipment_seller_profile: str + shipment_sold_by: str + shipment_condition: str + regular_total_order: str + regular_estimated_tax: str + regular_order_placed: str + regular_order_id: str + + # digital orders only + digital_order: str + digital_order_cancelled: str + digital_by: str + digital_sold_by: str + digital_tax_collected: str + digital_total_order: str + digital_order_id: str + digital_payment_information: str + + +class Locale_en_EN(Locale_Data): + """Language and region specific settings for parsing amazon.com invoices + """ + def __init__(self) -> None: + super().__init__( + LOCALE='en_EN', + tax_included_in_price=False, + + # common fields regular and digital orders + items_ordered='Items Ordered', # shipment + digital + price='Price', # shipment + digital + currency='USD', # shipment only + items_subtotal=r'Item\(s\) Subtotal:', # shipment +digital + total_before_tax='Total Before Tax:', # shipment + digital + pretax_adjustment_fields_pattern=('(?:' + '|'.join([ + 'Shipping & Handling', # Verpackung & Versand: + 'Free Shipping', + 'Free delivery', + 'Pantry delivery', + 'Promotion(?:s| Applied)', # Gutschein eingelöst: + 'Lightning Deal', + 'Your Coupon Savings', + '[0-9]+% off savings', + 'Subscribe & Save', + '[0-9]+ Audible Credit Applied', + '.*[0-9]+% Off.*', + 'Courtesy Credit', + 'Extra Savings', + '(?:.*) Discount', + 'Gift[ -]Wrap', + ]) + ') *:'), + posttax_adjustment_fields_pattern=r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X', + + # Payment Table & Credit Card Transactions + grand_total=r'\n\s*Grand Total:\s+(.*)\n', + credit_card_transactions='Credit Card transactions', + credit_card_last_digits=r'^([^:]+) ending in ([0-9]+):\s+([^:]+):$', + payment_type=[ + # only first matching regex is used! + r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Last (?:4 )?digits:\s+([0-9]{4})\n', + r'\n\s*(.+)\s+ending in\s+([0-9]{4})\n' + ], + payment_information='^Payment information$', + + # regular orders only + shipment_shipped_pattern='^Shipped on ([^\\n]+)$', + shipment_nonshipped_headers=[ + 'Service completed', + 'Preparing for Shipment', + 'Not Yet Shipped', + 'Shipping now' + ], + shipment_quantity=r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+of:', + shipment_of='of:', + shipment_sales_tax='Sales Tax:', + shipment_total='Total for This Shipment:', + shipment_seller_profile=' (seller profile)', + shipment_sold_by=r'(?P.*)\n\s*(?:Sold|Provided) by:? (?P[^\n]+)', + shipment_condition=r'\n.*\n\s*Condition: (?P[^\n]+)', + regular_total_order='Grand Total:', + regular_estimated_tax = 'Estimated tax to be collected:', + regular_order_placed=r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})', + regular_order_id=r'.*Order ([0-9\-]+)', + + # digital orders only + digital_order='Digital Order: (.*)', + digital_order_cancelled='Order Canceled', + digital_by='By', + digital_sold_by=r'Sold\s+By', + digital_tax_collected='Tax Collected:', + digital_total_order='Total for this Order:', + digital_order_id='^Amazon.com\\s+order number:\\s+(D[0-9-]+)$', + digital_payment_information='Payment Information' + ) @staticmethod def parse_amount(amount, assumed_currency=None) -> Amount: return parse_amount(amount, assumed_currency=assumed_currency) @staticmethod - def parse_date(date_str) -> str: + def parse_date(date_str) -> datetime.date: return dateutil.parser.parse(date_str).date() -class Locale_DE(): +class Locale_de_DE(Locale_Data): """Language and region specific settings for parsing amazon.de invoices """ - LOCALE = 'DE' - tax_included_in_price = True # no separate tax transactions - shipped_pattern = '^versandt am ([^\\n]+)$' - nonshipped_headers = { # Translations missing - 'Service completed', - 'Preparing for Shipment', - 'Not Yet Shipped', - 'Shipping now' - } - items_ordered = 'Bestellte Artikel' - price = 'Preis' - currency = 'EUR' - of = 'Exemplar(e) von:' - seller_profile = ' (Mitgliedsprofil)' - items_subtotal_regex = 'Zwischensumme:' - total_before_tax_regex = 'Summe ohne MwSt.:' - sales_tax_shipment = 'Anzurechnende MwSt.:' # not sure (only old invoices) - total_shipment = 'Gesamtsumme:' - - pattern_without_condition = r'(?P.*)\n\s*(?:Verkauf|Provided) durch:? (?P[^\n]+)' - # Provided by: Translation missing - pattern_with_condition = pattern_without_condition + r'\n.*\n\s*Zustand: (?P[^\n]+)' - - # Payment Table & Credit Card Transactions - grand_total_regex = r'\n\s*(?:Gesamtsumme|Endsumme):\s+(.*)\n' # regular: Gesamtsumme, digital: Endsumme - credit_card_transactions = 'Kreditkarten-Transaktionen' - last_digits_regex = r'^([^:]+) mit den Endziffern ([0-9]+):\s+([^:]+):$' - payment_type_regexes = [ - # only first matching regex is used! - r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Die letzten (?:4 )?Ziffern:\s+([0-9]{3,4})\n', # 3 digits for Bankeinzug - r'\n\s*(.+)\s+mit den Endziffern\s+([0-9]{4})\n' - ] - payment_information = '^Zahlungsdaten$' - grand_total = 'Gesamtsumme:' - - # Page Header - order_placed_regex = r'(?:Subscribe and Save )?Bestellung aufgegeben am:\s+(\d+\. [^\s]+ \d{4})' - # Translation missing: Subscribe and Save -> Sparabo?? - order_id_regular = r'.*Bestellung ([0-9\-]+)' - - # digital invoice - order_cancelled = 'Order Canceled' - digital_order = 'Digitale Bestellung: (.*)' - by = 'Von' - sold_by = r'Verkauft von' - tax_collected_digital = 'MwSt:' - estimated_tax = 'Anzurechnende MwSt.:' - total_order_digital = 'Endsumme:' - order_id_digital = '^Amazon.de\\s+Bestellnummer:\\s+(D[0-9-]+)$' - payment_information_digital = 'Zahlungsinformation' - - # most of translations still missing ... - pretax_adjustment_fields_pattern = ('(?:' + '|'.join([ - 'Verpackung & Versand', - 'Free Shipping', - 'Free delivery', - 'Pantry delivery', - 'Gutschein eingelöst', # english version not removed yet - 'Promotion(?:s| Applied)', - 'Lightning Deal', - 'Your Coupon Savings', - '[0-9]+% off savings', - 'Subscribe & Save', - '[0-9]+ Audible Credit Applied', - '.*[0-9]+% Off.*', - 'Courtesy Credit', - 'Extra Savings', - '(?:.*) Discount', - 'Gift[ -]Wrap', - ]) + ') *:') - posttax_adjustment_fields_pattern = r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X' - + def __init__(self): + super().__init__( + LOCALE='de_DE', + tax_included_in_price=True, # no separate tax transactions + + # common fields regular and digital orders + items_ordered='Bestellte Artikel', + price='Preis', + currency='EUR', + items_subtotal='Zwischensumme:', + total_before_tax='Summe ohne MwSt.:', + # most of translations still missing ... + pretax_adjustment_fields_pattern=('(?:' + '|'.join([ + 'Verpackung & Versand', + 'Free Shipping', + 'Free delivery', + 'Pantry delivery', + 'Gutschein eingelöst', # english version not removed yet + 'Promotion(?:s| Applied)', + 'Lightning Deal', + 'Your Coupon Savings', + '[0-9]+% off savings', + 'Subscribe & Save', + '[0-9]+ Audible Credit Applied', + '.*[0-9]+% Off.*', + 'Courtesy Credit', + 'Extra Savings', + '(?:.*) Discount', + 'Gift[ -]Wrap', + ]) + ') *:'), + posttax_adjustment_fields_pattern=r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X', + + # Payment Table & Credit Card Transactions + grand_total=r'\n\s*(?:Gesamtsumme|Endsumme):\s+(.*)\n', # regular: Gesamtsumme, digital: Endsumme + credit_card_transactions='Kreditkarten-Transaktionen', + credit_card_last_digits=r'^([^:]+) mit den Endziffern ([0-9]+):\s+([^:]+):$', + payment_type=[ + # only first matching regex is used! + r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Die letzten (?:4 )?Ziffern:\s+([0-9]{3,4})\n', # 3 digits for Bankeinzug + r'\n\s*(.+)\s+mit den Endziffern\s+([0-9]{4})\n' + ], + payment_information='^Zahlungsdaten$', + + # regular orders only + shipment_shipped_pattern='^versandt am ([^\\n]+)$', + shipment_nonshipped_headers={ # Translations missing + 'Service completed', + 'Preparing for Shipment', + 'Not Yet Shipped', + 'Shipping now' + }, + shipment_quantity=r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+of:', + shipment_of='Exemplar(e) von:', + shipment_sales_tax='Anzurechnende MwSt.:', # not sure (only old invoices) + shipment_total='Gesamtsumme:', + shipment_seller_profile=' (Mitgliedsprofil)', + shipment_sold_by=r'(?P.*)\n\s*(?:Verkauf|Provided) durch:? (?P[^\n]+)', + # Translation missing: Provided by + shipment_condition=r'\n.*\n\s*Zustand: (?P[^\n]+)', + regular_total_order='Gesamtsumme:', + regular_estimated_tax='Anzurechnende MwSt.:', + regular_order_placed=r'(?:Subscribe and Save )?Bestellung aufgegeben am:\s+(\d+\. [^\s]+ \d{4})', + # Translation missing: Subscribe and Save -> Sparabo?? + regular_order_id=r'.*Bestellung ([0-9\-]+)', + + # digital orders only + digital_order_cancelled='Order Canceled', + digital_order='Digitale Bestellung: (.*)', + digital_by='Von', + digital_sold_by=r'Verkauft von', + digital_tax_collected='MwSt:', + digital_total_order='Endsumme:', + digital_order_id='^Amazon.de\\s+Bestellnummer:\\s+(D[0-9-]+)$', + digital_payment_information='Zahlungsinformation' + ) @staticmethod def _format_number_str(value: str) -> str: @@ -186,7 +243,7 @@ def parse_amount(amount: str, assumed_currency=None) -> Amount: return None else: return parse_amount( - Locale_DE._format_number_str(amount), + Locale_de_DE._format_number_str(amount), assumed_currency=assumed_currency) class _parserinfo(dateutil.parser.parserinfo): @@ -198,11 +255,12 @@ class _parserinfo(dateutil.parser.parserinfo): ] @staticmethod - def parse_date(date_str) -> str: - return dateutil.parser.parse(date_str, parserinfo=Locale_DE._parserinfo(dayfirst=True)).date() + def parse_date(date_str) -> datetime.date: + return dateutil.parser.parse(date_str, parserinfo=Locale_de_DE._parserinfo(dayfirst=True)).date() -LOCALE = {x.LOCALE : x for x in [Locale_EN, Locale_DE]} +LOCALES_type = Dict[str, Any] +LOCALES: LOCALES_type = {'en_EN': Locale_en_EN, 'de_DE': Locale_de_DE} Errors = List[str] Adjustment = NamedTuple('Adjustment', [ @@ -267,692 +325,663 @@ def to_json(obj): return obj -class AmazonInvoice(): - def __init__(self, locale='EN'): - self.locale = LOCALE[locale] - - @staticmethod - def add_amount(a: Optional[Amount], b: Optional[Amount]) -> Optional[Amount]: - """Add two amounts, amounts with value `None` are ignored. - """ - if a is None: - return b - if b is None: - return a - return beancount.core.amount.add(a, b) - - @staticmethod - def reduce_amounts(amounts: Iterable[Amount]) -> Optional[Amount]: - """Reduce iterable of amounts to sum by applying `add_amount`. - """ - return functools.reduce(AmazonInvoice.add_amount, amounts, None) - - @staticmethod - def get_field_in_table(table, pattern, allow_multiple=False, - return_label=False): - def predicate(node): - return node.name == 'td' and re.fullmatch(pattern, node.text.strip(), - re.I) is not None - - tds = table.find_all(predicate) - results = [(td.text.strip().strip(':'), - td.find_next_sibling('td').text.strip()) for td in tds] - if not return_label: - results = [r[1] for r in results] - if not allow_multiple: - if not results: - return None - return results[0] - return results - - def get_adjustments_in_table(self, table, pattern, assumed_currency=None): - adjustments = [] - for label, amount_str in AmazonInvoice.get_field_in_table( - table, pattern, allow_multiple=True, return_label=True): - adjustments.append( - Adjustment(amount=self.locale.parse_amount(amount_str, assumed_currency), - description=label)) - return adjustments - - @staticmethod - def reduce_adjustments(adjustments: List[Adjustment]) -> List[Adjustment]: - all_adjustments = collections.OrderedDict() # type: Dict[str, List[Amount]] - for adjustment in adjustments: - all_adjustments.setdefault(adjustment.description, - []).append(adjustment.amount) - return [ - Adjustment(k, AmazonInvoice.reduce_amounts(v)) for k, v in all_adjustments.items() - ] - - - def parse_shipments(self, soup) -> List[Shipment]: - """ - Parses Shipment Table Part of HTML document (1st Table) - """ - - # shipped_pattern = '^Shipped on ([^\\n]+)$' - # # versandt am 27. September 2021 - # # Shipped on February 8, 2016 - # nonshipped_headers = { - # 'Service completed', - # 'Preparing for Shipment', - # 'Not Yet Shipped', - # 'Shipping now' - # } - - def is_shipment_header_table(node): - if node.name != 'table': - return False - text = node.text.strip() - m = re.match(self.locale.shipped_pattern, text) - return m is not None or text in self.locale.nonshipped_headers - - header_tables = soup.find_all(is_shipment_header_table) - - shipments = [] # type: List[Shipment] - errors = [] # type: Errors - - for header_table in header_tables: - text = header_table.text.strip() - shipped_date = None - if text not in self.locale.nonshipped_headers: - m = re.match(self.locale.shipped_pattern, text) - assert m is not None - shipped_date = self.locale.parse_date(m.group(1)) - - items = [] - - shipment_table = header_table.find_parent('table') - - def is_items_ordered_header(node): - if node.name != 'tr': - return False - tds = node('td') - if len(tds) < 2: - return False - return (tds[0].text.strip() == self.locale.items_ordered and - tds[1].text.strip() == self.locale.price) - # Items Ordered - # Bestellte Artikel - # Price - # Preis - - items_ordered_header = shipment_table.find(is_items_ordered_header) - - item_rows = items_ordered_header.find_next_siblings('tr') - - logger.info('Parsing Shipment Items') - for item_row in item_rows: - tds = item_row('td') - description_node = tds[0] - price_node = tds[1] - price = price_node.text.strip() - - if price is None: - price = Amount(D(0), self.locale.currency) - # EUR 16,99 - # $11.87 - else: - price = self.locale.parse_amount(price) - - # 1 of: 365 Everyday Value, Potato Yellow Bag Organic, 48 Ounce - # 2 (1.04 lb) of: Broccoli Crowns Conventional, 1 Each - # 2.07 lb of: Pork Sausage Link Italian Mild Step 1 - - pattern_quantity = r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+of:' - # ToDo: check if this matches all locales, e.g. 'of' and units - m = re.match(pattern_quantity, description_node.text, re.UNICODE|re.DOTALL) - quantity = 1 - if m is not None: - # Amazon will say you got, e.g. 2 broccoli crowns at $1.69/lb - but then this code multiplies the 2 by the price listed - # on the invoice, which is the total price in this case (but the per-unit price in other cases) - so if there's a quantity - # and a weight, ignore the quantity and treat it as 1 - # alternately, capture the weight and the per-unit price and multiply out - quantity = m.group("quantity") # ignore quantity for weight items - - if quantity is None: - #print("Unable to extract quantity, using 1: %s" % description_node.text) - quantity = D(1) - else: - quantity = D(quantity) - - text = description_node.text.split(self.locale.of, 1)[1] - # Übersetzung fehlt - - m = re.match(self.locale.pattern_with_condition, text, re.UNICODE | re.DOTALL) - if m is None: - m = re.match(self.locale.pattern_without_condition, text, re.UNICODE | re.DOTALL) - if m is None: - raise Exception("Could not extract item from row", text) - - description = re.sub(r'\s+', ' ', m.group('description').strip()) - sold_by = re.sub(r'\s+', ' ', m.group('sold_by').strip()) - try: - condition = re.sub(r'\s+', ' ', m.group('condition').strip()) - except IndexError: - condition = None - suffix = self.locale.seller_profile - if sold_by.endswith(suffix): - sold_by = sold_by[:-len(suffix)] - items.append( - Item( - quantity=quantity, - description=description, - sold_by=sold_by, - condition=condition, - price=price, - )) - - logger.info('Parsing Shipment Amounts') - items_subtotal = self.locale.parse_amount( - self.get_field_in_table(shipment_table, self.locale.items_subtotal_regex)) - - expected_items_subtotal = self.reduce_amounts( - beancount.core.amount.mul(x.price, D(x.quantity)) for x in items) - if (items_subtotal is not None and - expected_items_subtotal != items_subtotal): - errors.append( - 'expected items subtotal is %r, but parsed value is %r' % - (expected_items_subtotal, items_subtotal)) - - output_fields = dict() - output_fields['pretax_adjustments'] = self.get_adjustments_in_table( - shipment_table, self.locale.pretax_adjustment_fields_pattern) - output_fields['posttax_adjustments'] = self.get_adjustments_in_table( - shipment_table, self.locale.posttax_adjustment_fields_pattern) - pretax_parts = [items_subtotal or expected_items_subtotal] + [ - a.amount for a in output_fields['pretax_adjustments'] - ] - total_before_tax = self.locale.parse_amount( - self.get_field_in_table(shipment_table, self.locale.total_before_tax_regex)) - expected_total_before_tax = self.reduce_amounts(pretax_parts) - if total_before_tax is None: - total_before_tax = expected_total_before_tax - elif expected_total_before_tax != total_before_tax: - errors.append( - 'expected total before tax is %s, but parsed value is %s' % - (expected_total_before_tax, total_before_tax)) - - sales_tax = self.get_adjustments_in_table(shipment_table, self.locale.sales_tax_shipment) - # Sales Tax: - # Anzurechnende MwSt.: - - posttax_parts = ( - [total_before_tax] + [a.amount for a in sales_tax] + - [a.amount for a in output_fields['posttax_adjustments']]) - total = self.locale.parse_amount( - self.get_field_in_table(shipment_table, self.locale.total_shipment)) - # Total for This Shipment: - # Gesamtsumme: - expected_total = self.reduce_amounts(posttax_parts) - if total is None: - total = expected_total - elif expected_total != total: - errors.append('expected total is %s, but parsed value is %s' % - (expected_total, total)) - - shipments.append( - Shipment( - shipped_date=shipped_date, - items=items, - items_subtotal=items_subtotal, - total_before_tax=total_before_tax, - tax=sales_tax, - total=total, - errors=errors, - **output_fields)) - - return shipments - - - def parse_credit_card_transactions_from_payments_table( - self, - payment_table, - order_date: datetime.date) -> List[CreditCardTransaction]: - """ Parse payment information from payments table. - Only type and last digits are given, no amount (assuming grand total). - Other payment methods than credit card are possible: - - Direct Debit (DE: Bankeinzug) - """ - payment_text = '\n'.join(payment_table.strings) - m = re.search(self.locale.grand_total_regex, payment_text) - assert m is not None - grand_total = self.locale.parse_amount(m.group(1).strip()) +def add_amount(a: Optional[Amount], b: Optional[Amount]) -> Optional[Amount]: + """Add two amounts, amounts with value `None` are ignored. + """ + if a is None: + return b + if b is None: + return a + return beancount.core.amount.add(a, b) - for regex in self.locale.payment_type_regexes: - m = re.search(regex, payment_text) - if m is not None: - break - # m = re.search(self.locale.last_digits_regex1, payment_text) - # if m is None: - # m = re.search(self.locale.last_digits_regex2, payment_text) +def reduce_amounts(amounts: Iterable[Amount]) -> Optional[Amount]: + """Reduce iterable of amounts to sum by applying `add_amount`. + """ + return functools.reduce(add_amount, amounts, None) + + +def get_field_in_table(table, pattern, allow_multiple=False, + return_label=False): + def predicate(node): + return node.name == 'td' and re.fullmatch(pattern, node.text.strip(), + re.I) is not None + + tds = table.find_all(predicate) + results = [(td.text.strip().strip(':'), + td.find_next_sibling('td').text.strip()) for td in tds] + if not return_label: + results = [r[1] for r in results] + if not allow_multiple: + if not results: + return None + return results[0] + return results + + +def get_adjustments_in_table(table, pattern, assumed_currency=None, locale=Locale_en_EN()): + adjustments = [] + for label, amount_str in get_field_in_table( + table, pattern, allow_multiple=True, return_label=True): + adjustments.append( + Adjustment(amount=locale.parse_amount(amount_str, assumed_currency), + description=label)) + return adjustments + + +def reduce_adjustments(adjustments: List[Adjustment]) -> List[Adjustment]: + all_adjustments = collections.OrderedDict() # type: Dict[str, List[Amount]] + for adjustment in adjustments: + all_adjustments.setdefault(adjustment.description, + []).append(adjustment.amount) + return [ + Adjustment(k, reduce_amounts(v)) for k, v in all_adjustments.items() + ] + - if m is not None: - credit_card_transactions = [ - CreditCardTransaction( - date=order_date, - amount=grand_total, - card_description=m.group(1).strip(), - card_ending_in=m.group(2).strip(), - ) - ] - else: - credit_card_transactions = [] - return credit_card_transactions - - - def parse_credit_card_transactions(self, soup) -> List[CreditCardTransaction]: - """ Parse Credit Card Transactions from bottom sub-table of payments table. - Transactions are listed with type, 4 digits, transaction date and amount. - """ - def is_header_node(node): - return node.name == 'td' and node.text.strip( - ) == self.locale.credit_card_transactions - - header_node = soup.find(is_header_node) - if header_node is None: - return [] - sibling = header_node.find_next_sibling('td') - rows = sibling.find_all('tr') - transactions = [] - for row in rows: - if not row.text.strip(): - continue - tds = row('td') - description = tds[0].text.strip() - amount_text = tds[1].text.strip() - m = re.match(self.locale.last_digits_regex, description, - re.UNICODE) +def parse_shipments(soup, locale=Locale_en_EN()) -> List[Shipment]: + """ + Parses Shipment Table Part of HTML document (1st Table) + """ + def is_shipment_header_table(node): + if node.name != 'table': + return False + text = node.text.strip() + m = re.match(locale.shipment_shipped_pattern, text) + return m is not None or text in locale.shipment_nonshipped_headers + + header_tables = soup.find_all(is_shipment_header_table) + + shipments = [] # type: List[Shipment] + errors = [] # type: Errors + + for header_table in header_tables: + logger.debug('extracting shipped date...') + text = header_table.text.strip() + shipped_date = None + if text not in locale.shipment_nonshipped_headers: + # extract shipped date if order already shipped + m = re.match(locale.shipment_shipped_pattern, text) assert m is not None - transactions.append( - CreditCardTransaction( - date=self.locale.parse_date(m.group(3)), - card_description=m.group(1), - card_ending_in=m.group(2), - amount=self.locale.parse_amount(amount_text), - )) - return transactions - - - def parse_invoice(self, path: str) -> Optional[Order]: - """ 1st method to call, distinguish between regular and digital invoice. - """ - if os.path.basename(path).startswith('D'): - logger.info('identified as digital invoice') - return self.parse_digital_order_invoice(path) - logger.info('identified as regular invoice') - return self.parse_regular_order_invoice(path) - - - def parse_regular_order_invoice(self, path: str) -> Order: - errors = [] - with open(path, 'rb') as f: - soup = bs4.BeautifulSoup(f.read(), 'lxml') - logger.info('parsing shipments...') - shipments = self.parse_shipments(soup) - logger.info('finished parsing shipments') - logger.info('parsing payment table...') - payment_table_header = soup.find( - lambda node: node.name == 'table' and re.match( - self.locale.payment_information, node.text.strip())) - - payment_table = payment_table_header.find_parent('table') - - logger.debug('parsing pretax adjustments...') - output_fields = dict() - output_fields['pretax_adjustments'] = self.get_adjustments_in_table( - payment_table, self.locale.pretax_adjustment_fields_pattern) - payment_adjustments = collections.OrderedDict() # type: Dict[str, Amount] - - # older invoices put pre-tax amounts on a per-shipment basis - # new invoices only put pre-tax amounts on the overall payments section - # detect which this is - pretax_amount = self.reduce_amounts( - a.amount for a in output_fields['pretax_adjustments']) - shipments_pretax_amount = None - - if any(s.pretax_adjustments for s in shipments): - shipments_pretax_amount = self.reduce_amounts(a.amount - for shipment in shipments - for a in shipment.pretax_adjustments) - - if shipments_pretax_amount != pretax_amount: - errors.append( - 'expected total pretax adjustment to be %s, but parsed total is %s' - % (shipments_pretax_amount, pretax_amount)) - - - logger.debug('parsing posttax adjustments...') - payments_total_adjustments = [] - shipments_total_adjustments = [] - - # parse first to get an idea of the working currency - grand_total = self.locale.parse_amount( - self.get_field_in_table(payment_table, self.locale.grand_total)) - - def resolve_posttax_adjustments(): - payment_adjustments.update( - self.reduce_adjustments( - self.get_adjustments_in_table(payment_table, - self.locale.posttax_adjustment_fields_pattern, - assumed_currency=grand_total.currency))) - all_shipments_adjustments = collections.OrderedDict( - self.reduce_adjustments( - sum((x.posttax_adjustments for x in shipments), []))) - all_keys = collections.OrderedDict(payment_adjustments.items()) - all_keys.update(all_shipments_adjustments.items()) - - all_adjustments = collections.OrderedDict() # type: Dict[str, Amount] - for key in all_keys: - payment_amount = payment_adjustments.get(key) - shipments_amount = all_shipments_adjustments.get(key) - amount = payment_amount - if payment_amount is None and shipments_amount is not None: - # Amazon sometimes doesn't include adjustments in the Payments table - amount = shipments_amount - payments_total_adjustments.append(amount) - elif payment_amount is not None and shipments_amount is None: - # Amazon sometimes doesn't include these adjustments in the Shipment table - shipments_total_adjustments.append(amount) - elif payment_amount != shipments_amount: - errors.append( - 'expected total %r to be %s, but parsed total is %s' % - (key, shipments_amount, payment_amount)) - all_adjustments[key] = amount - return [Adjustment(k, v) for k, v in all_adjustments.items()] - - output_fields['posttax_adjustments'] = resolve_posttax_adjustments() - - logger.debug('consistency check taxes...') - tax = self.locale.parse_amount( - self.get_field_in_table(payment_table, self.locale.estimated_tax)) - - expected_tax = self.reduce_amounts( - a.amount for shipment in shipments for a in shipment.tax) - if expected_tax is None: - # tax not given on shipment level - if not self.locale.tax_included_in_price: - # add tax if not already included in item prices - shipments_total_adjustments.append(tax) - elif expected_tax != tax: - errors.append( - 'expected tax is %s, but parsed value is %s' % (expected_tax, tax)) - - if self.locale.tax_included_in_price: - # tax is already inlcuded in item prices - # do not add additional transaction for taxes - tax = None - - logger.debug('consistency check grand total...') - payments_total_adjustment = self.reduce_amounts(payments_total_adjustments) - shipments_total_adjustment = self.reduce_amounts(shipments_total_adjustments) - - expected_total = self.add_amount(shipments_total_adjustment, - self.reduce_amounts(x.total for x in shipments)) - - # if no shipments pre-tax section, then the expected total isn't accounting - # for the pre-tax adjustments yet since they are only in the grand total section - if shipments_pretax_amount is None: - expected_total = self.add_amount(expected_total, pretax_amount) - - adjusted_grand_total = self.add_amount(payments_total_adjustment, grand_total) - if expected_total != adjusted_grand_total: - errors.append('expected grand total is %s, but parsed value is %s' % - (expected_total, adjusted_grand_total)) - - logger.debug('parsing order placed date...') - def is_order_placed_node(node): - m = re.fullmatch(self.locale.order_placed_regex, node.text.strip()) - return m is not None - - node = soup.find(is_order_placed_node) - m = re.fullmatch(self.locale.order_placed_regex, node.text.strip()) - assert m is not None - order_date = self.locale.parse_date(m.group(1)) - - logger.debug('parsing credit card transactions...') - credit_card_transactions = self.parse_credit_card_transactions(soup) - if not credit_card_transactions: - logger.debug('no credit card transactions table given, falling back to payments table') - credit_card_transactions = self.parse_credit_card_transactions_from_payments_table( - payment_table, order_date) - - if credit_card_transactions: - total_payments = self.reduce_amounts( - x.amount for x in credit_card_transactions) - else: - logger.info('no payment transactions found, assumig grand total as total payment amount') - total_payments = grand_total - if total_payments != adjusted_grand_total: - errors.append('total payment amount is %s, but grand total is %s' % - (total_payments, adjusted_grand_total)) - - logger.debug('parsing order ID...') - title = soup.find('title').text.strip() - m = re.fullmatch(self.locale.order_id_regular, title.strip()) - assert m is not None - - logger.debug('...finished parsing invoice.') - - return Order( - order_date=order_date, - order_id=m.group(1), - shipments=shipments, - credit_card_transactions=credit_card_transactions, - tax=tax, - errors=sum((shipment.errors - for shipment in shipments), cast(Errors, [])) + errors, - **output_fields) - - @staticmethod - def get_text_lines(parent_node): - text_lines = [''] - for node in parent_node.children: - if isinstance(node, bs4.NavigableString): - text_lines[-1] += str(node) - elif node.name == 'br': - text_lines.append('') - else: - text_lines[-1] += node.text - return text_lines - - - def parse_digital_order_invoice(self, path: str) -> Optional[Order]: - errors = [] - with open(path, 'rb') as f: - soup = bs4.BeautifulSoup(f.read(), 'lxml') + shipped_date = locale.parse_date(m.group(1)) - logger.debug('check if order has been cancelled...') - def is_cancelled_order(node): - return node.text.strip() == self.locale.order_cancelled - - if soup.find(is_cancelled_order): - return None - - logger.debug('parsing header...') - def is_digital_order_row(node): - if node.name != 'tr': - return False - m = re.match(self.locale.digital_order, node.text.strip()) - if m is None: - return False - try: - self.locale.parse_date(m.group(1)) - return True - except: - return False + items = [] - # Find Digital Order row - digital_order_header = soup.find(is_digital_order_row) - digital_order_table = digital_order_header.find_parent('table') - m = re.match(self.locale.digital_order, digital_order_header.text.strip()) - assert m is not None - order_date = self.locale.parse_date(m.group(1)) + shipment_table = header_table.find_parent('table') - logger.debug('parsing items...') + logger.debug('parsing shipment items...') def is_items_ordered_header(node): if node.name != 'tr': return False tds = node('td') if len(tds) < 2: return False - return (tds[0].text.strip() == self.locale.items_ordered and - tds[1].text.strip() == self.locale.price) + return (tds[0].text.strip() == locale.items_ordered and + tds[1].text.strip() == locale.price) - items_ordered_header = digital_order_table.find(is_items_ordered_header) + items_ordered_header = shipment_table.find(is_items_ordered_header) item_rows = items_ordered_header.find_next_siblings('tr') - items = [] - - other_fields_td = None for item_row in item_rows: tds = item_row('td') - if len(tds) != 2: - other_fields_td = tds[0] - continue description_node = tds[0] price_node = tds[1] price = price_node.text.strip() - a = description_node.find('a') - if a is not None: - description = a.text.strip() - url = a['href'] + if price is None: + price = Amount(D(0), locale.currency) else: - bold_node = description_node.find('b') - description = bold_node.text.strip() - url = None + price = locale.parse_amount(price) - text_lines = self.get_text_lines(description_node) + # 1 of: 365 Everyday Value, Potato Yellow Bag Organic, 48 Ounce + # 2 (1.04 lb) of: Broccoli Crowns Conventional, 1 Each + # 2.07 lb of: Pork Sausage Link Italian Mild Step 1 - def get_label_value(label): - for line in text_lines: - m = re.match(r'^\s*' + label + ': (.*)$', line, - re.UNICODE | re.DOTALL) - if m is None: - continue - return m.group(1) + m = re.match(locale.shipment_quantity, description_node.text, re.UNICODE|re.DOTALL) + quantity = 1 + if m is not None: + # Amazon will say you got, e.g. 2 broccoli crowns at $1.69/lb - but then this code multiplies the 2 by the price listed + # on the invoice, which is the total price in this case (but the per-unit price in other cases) - so if there's a quantity + # and a weight, ignore the quantity and treat it as 1 + # alternately, capture the weight and the per-unit price and multiply out + quantity = m.group("quantity") # ignore quantity for weight items + + if quantity is None: + #print("Unable to extract quantity, using 1: %s" % description_node.text) + quantity = D(1) + else: + quantity = D(quantity) - by = get_label_value(self.locale.by) - sold_by = get_label_value(self.locale.sold_by) + text = description_node.text.split(locale.shipment_of, 1)[1] + m = re.match(locale.shipment_sold_by + locale.shipment_condition, + text, re.UNICODE | re.DOTALL) + if m is None: + m = re.match(locale.shipment_sold_by, text, re.UNICODE | re.DOTALL) + if m is None: + raise Exception("Could not extract item from row", text) + + description = re.sub(r'\s+', ' ', m.group('description').strip()) + sold_by = re.sub(r'\s+', ' ', m.group('sold_by').strip()) + try: + condition = re.sub(r'\s+', ' ', m.group('condition').strip()) + except IndexError: + condition = None + suffix = locale.shipment_seller_profile + if sold_by.endswith(suffix): + sold_by = sold_by[:-len(suffix)] items.append( - DigitalItem( + Item( + quantity=quantity, description=description, - by=by, sold_by=sold_by, - url=url, - price=self.locale.parse_amount(price), + condition=condition, + price=price, )) - - other_fields_text_lines = self.get_text_lines(other_fields_td) - - logger.debug('parsing amounts...') - def get_other_field(pattern, allow_multiple=False, return_label=False): - results = [] - for line in other_fields_text_lines: - r = r'^\s*(' + pattern + r')\s+(.*[^\s])\s*$' - m = re.match(r, line, re.UNICODE) - if m is not None: - results.append((m.group(1).strip(':'), m.group(2))) - if not return_label: - results = [r[1] for r in results] - if not allow_multiple: - if not results: - return None - return results[0] - return results - - def get_adjustments(pattern): - adjustments = [] - for label, amount_str in get_other_field( - pattern, allow_multiple=True, return_label=True): - adjustments.append( - Adjustment(amount=self.locale.parse_amount(amount_str), description=label)) - return adjustments - - def get_amounts_in_text(pattern_map): - amounts = dict() - for key, label in pattern_map.items(): - amount = self.locale.parse_amount(get_other_field(label)) - amounts[key] = amount - return amounts - - items_subtotal = self.locale.parse_amount( - get_other_field(self.locale.items_subtotal_regex)) - total_before_tax = self.locale.parse_amount( - get_other_field(self.locale.total_before_tax_regex)) - tax = get_adjustments(self.locale.tax_collected_digital) - total_for_this_order = self.locale.parse_amount( - get_other_field(self.locale.total_order_digital)) - logger.debug('parsing pretax adjustments...') + logger.debug('parsing shipment amounts...') + items_subtotal = locale.parse_amount( + get_field_in_table(shipment_table, locale.items_subtotal)) + + expected_items_subtotal = reduce_amounts( + beancount.core.amount.mul(x.price, D(x.quantity)) for x in items) + if (items_subtotal is not None and + expected_items_subtotal != items_subtotal): + errors.append( + 'expected items subtotal is %r, but parsed value is %r' % + (expected_items_subtotal, items_subtotal)) + output_fields = dict() - output_fields['pretax_adjustments'] = get_adjustments( - self.locale.pretax_adjustment_fields_pattern) - pretax_parts = ([items_subtotal] + - [a.amount for a in output_fields['pretax_adjustments']]) - logger.debug(pretax_parts) - logger.debug(total_before_tax) - expected_total_before_tax = self.reduce_amounts(pretax_parts) - if expected_total_before_tax != total_before_tax: - errors.append('expected total before tax is %s, but parsed value is %s' - % (expected_total_before_tax, total_before_tax)) - - logger.debug('parsing posttax adjustments...') - output_fields['posttax_adjustments'] = get_adjustments( - self.locale.posttax_adjustment_fields_pattern) - posttax_parts = ([total_before_tax] + [a.amount for a in tax] + - [a.amount for a in output_fields['posttax_adjustments']]) - expected_total = self.reduce_amounts(posttax_parts) - - logger.debug(total_for_this_order) - if expected_total != total_for_this_order: + output_fields['pretax_adjustments'] = get_adjustments_in_table( + shipment_table, locale.pretax_adjustment_fields_pattern, locale=locale) + output_fields['posttax_adjustments'] = get_adjustments_in_table( + shipment_table, locale.posttax_adjustment_fields_pattern, locale=locale) + pretax_parts = [items_subtotal or expected_items_subtotal] + [ + a.amount for a in output_fields['pretax_adjustments'] + ] + total_before_tax = locale.parse_amount( + get_field_in_table(shipment_table, locale.total_before_tax)) + expected_total_before_tax = reduce_amounts(pretax_parts) + if total_before_tax is None: + total_before_tax = expected_total_before_tax + elif expected_total_before_tax != total_before_tax: + errors.append( + 'expected total before tax is %s, but parsed value is %s' % + (expected_total_before_tax, total_before_tax)) + + sales_tax = get_adjustments_in_table(shipment_table, locale.shipment_sales_tax, locale=locale) + + posttax_parts = ( + [total_before_tax] + [a.amount for a in sales_tax] + + [a.amount for a in output_fields['posttax_adjustments']]) + total = locale.parse_amount( + get_field_in_table(shipment_table, locale.shipment_total)) + expected_total = reduce_amounts(posttax_parts) + if total is None: + total = expected_total + elif expected_total != total: errors.append('expected total is %s, but parsed value is %s' % - (expected_total, total_for_this_order)) - - if self.locale.tax_included_in_price: - tax = [] - - shipment = Shipment( - shipped_date=order_date, - items=items, - items_subtotal=items_subtotal, - total_before_tax=total_before_tax, - tax=tax, - total=total_for_this_order, - errors=errors, - **output_fields) - - order_id_td = soup.find( - lambda node: node.name == 'td' and - re.match(self.locale.order_id_digital, node.text.strip()) + (expected_total, total)) + + logger.debug('...finshed parsing shipment') + shipments.append( + Shipment( + shipped_date=shipped_date, + items=items, + items_subtotal=items_subtotal, + total_before_tax=total_before_tax, + tax=sales_tax, + total=total, + errors=errors, + **output_fields)) + + return shipments + + +def parse_credit_card_transactions_from_payments_table( + payment_table, + order_date: datetime.date, + locale=Locale_en_EN()) -> List[CreditCardTransaction]: + """ Parse payment information from payments table. + Only type and last digits are given, no amount (assuming grand total). + Other payment methods than credit card are possible: + - Direct Debit (DE: Bankeinzug) + """ + payment_text = '\n'.join(payment_table.strings) + m = re.search(locale.grand_total, payment_text) + assert m is not None + grand_total = locale.parse_amount(m.group(1).strip()) + + for regex in locale.payment_type: + m = re.search(regex, payment_text) + if m is not None: + break + + if m is not None: + credit_card_transactions = [ + CreditCardTransaction( + date=order_date, + amount=grand_total, + card_description=m.group(1).strip(), + card_ending_in=m.group(2).strip(), ) - m = re.match(self.locale.order_id_digital, order_id_td.text.strip()) + ] + else: + credit_card_transactions = [] + return credit_card_transactions + + +def parse_credit_card_transactions(soup, locale=Locale_en_EN()) -> List[CreditCardTransaction]: + """ Parse Credit Card Transactions from bottom sub-table of payments table. + Transactions are listed with type, 4 digits, transaction date and amount. + """ + def is_header_node(node): + return node.name == 'td' and node.text.strip( + ) == locale.credit_card_transactions + + header_node = soup.find(is_header_node) + if header_node is None: + return [] + sibling = header_node.find_next_sibling('td') + rows = sibling.find_all('tr') + transactions = [] + for row in rows: + if not row.text.strip(): + continue + tds = row('td') + description = tds[0].text.strip() + amount_text = tds[1].text.strip() + m = re.match(locale.credit_card_last_digits, description, + re.UNICODE) assert m is not None - order_id = m.group(1) + transactions.append( + CreditCardTransaction( + date=locale.parse_date(m.group(3)), + card_description=m.group(1), + card_ending_in=m.group(2), + amount=locale.parse_amount(amount_text), + )) + return transactions + + +def parse_invoice(path: str, locale=Locale_en_EN()) -> Optional[Order]: + """ 1st method to call, distinguish between regular and digital invoice. + """ + if os.path.basename(path).startswith('D'): + logger.debug('identified as digital invoice') + return parse_digital_order_invoice(path, locale=locale) + logger.debug('identified as regular invoice') + return parse_regular_order_invoice(path, locale=locale) + + +def parse_regular_order_invoice(path: str, locale=Locale_en_EN()) -> Order: + errors = [] + with open(path, 'rb') as f: + soup = bs4.BeautifulSoup(f.read(), 'lxml') + logger.debug('parsing shipments...') + shipments = parse_shipments(soup, locale=locale) + logger.debug('finished parsing shipments') + logger.debug('parsing payment table...') + payment_table_header = soup.find( + lambda node: node.name == 'table' and re.match( + locale.payment_information, node.text.strip())) + + payment_table = payment_table_header.find_parent('table') + + logger.debug('parsing pretax adjustments...') + output_fields = dict() + output_fields['pretax_adjustments'] = get_adjustments_in_table( + payment_table, locale.pretax_adjustment_fields_pattern, locale=locale) + payment_adjustments = collections.OrderedDict() # type: Dict[str, Amount] + + # older invoices put pre-tax amounts on a per-shipment basis + # new invoices only put pre-tax amounts on the overall payments section + # detect which this is + pretax_amount = reduce_amounts( + a.amount for a in output_fields['pretax_adjustments']) + shipments_pretax_amount = None + + if any(s.pretax_adjustments for s in shipments): + shipments_pretax_amount = reduce_amounts(a.amount + for shipment in shipments + for a in shipment.pretax_adjustments) + + if shipments_pretax_amount != pretax_amount: + errors.append( + 'expected total pretax adjustment to be %s, but parsed total is %s' + % (shipments_pretax_amount, pretax_amount)) + + logger.debug('parsing posttax adjustments...') + payments_total_adjustments = [] + shipments_total_adjustments = [] + + # parse first to get an idea of the working currency + grand_total = locale.parse_amount( + get_field_in_table(payment_table, locale.regular_total_order)) + + def resolve_posttax_adjustments(): + payment_adjustments.update( + reduce_adjustments( + get_adjustments_in_table(payment_table, + locale.posttax_adjustment_fields_pattern, + assumed_currency=grand_total.currency, + locale=locale))) + all_shipments_adjustments = collections.OrderedDict( + reduce_adjustments( + sum((x.posttax_adjustments for x in shipments), []))) + all_keys = collections.OrderedDict(payment_adjustments.items()) + all_keys.update(all_shipments_adjustments.items()) + + all_adjustments = collections.OrderedDict() # type: Dict[str, Amount] + for key in all_keys: + payment_amount = payment_adjustments.get(key) + shipments_amount = all_shipments_adjustments.get(key) + amount = payment_amount + if payment_amount is None and shipments_amount is not None: + # Amazon sometimes doesn't include adjustments in the Payments table + amount = shipments_amount + payments_total_adjustments.append(amount) + elif payment_amount is not None and shipments_amount is None: + # Amazon sometimes doesn't include these adjustments in the Shipment table + shipments_total_adjustments.append(amount) + elif payment_amount != shipments_amount: + errors.append( + 'expected total %r to be %s, but parsed total is %s' % + (key, shipments_amount, payment_amount)) + all_adjustments[key] = amount + return [Adjustment(k, v) for k, v in all_adjustments.items()] + + output_fields['posttax_adjustments'] = resolve_posttax_adjustments() + + logger.debug('consistency check taxes...') + tax = locale.parse_amount( + get_field_in_table(payment_table, locale.regular_estimated_tax)) + + expected_tax = reduce_amounts( + a.amount for shipment in shipments for a in shipment.tax) + if expected_tax is None: + # tax not given on shipment level + if not locale.tax_included_in_price: + # add tax if not already included in item prices + shipments_total_adjustments.append(tax) + elif expected_tax != tax: + errors.append( + 'expected tax is %s, but parsed value is %s' % (expected_tax, tax)) + + if locale.tax_included_in_price: + # tax is already inlcuded in item prices + # do not add additional transaction for taxes + tax = None + + logger.debug('consistency check grand total...') + payments_total_adjustment = reduce_amounts(payments_total_adjustments) + shipments_total_adjustment = reduce_amounts(shipments_total_adjustments) + + expected_total = add_amount(shipments_total_adjustment, + reduce_amounts(x.total for x in shipments)) + + # if no shipments pre-tax section, then the expected total isn't accounting + # for the pre-tax adjustments yet since they are only in the grand total section + if shipments_pretax_amount is None: + expected_total = add_amount(expected_total, pretax_amount) + + adjusted_grand_total = add_amount(payments_total_adjustment, grand_total) + if expected_total != adjusted_grand_total: + errors.append('expected grand total is %s, but parsed value is %s' % + (expected_total, adjusted_grand_total)) + + logger.debug('parsing order placed date...') + def is_order_placed_node(node): + m = re.fullmatch(locale.regular_order_placed, node.text.strip()) + return m is not None + + node = soup.find(is_order_placed_node) + m = re.fullmatch(locale.regular_order_placed, node.text.strip()) + assert m is not None + order_date = locale.parse_date(m.group(1)) + + logger.debug('parsing credit card transactions...') + credit_card_transactions = parse_credit_card_transactions(soup, locale=locale) + if not credit_card_transactions: + logger.debug('no credit card transactions table given, falling back to payments table') + credit_card_transactions = parse_credit_card_transactions_from_payments_table( + payment_table, order_date, locale=locale) + + if credit_card_transactions: + total_payments = reduce_amounts( + x.amount for x in credit_card_transactions) + else: + logger.debug('no payment transactions found, assumig grand total as total payment amount') + total_payments = grand_total + if total_payments != adjusted_grand_total: + errors.append('total payment amount is %s, but grand total is %s' % + (total_payments, adjusted_grand_total)) + + logger.debug('parsing order ID...') + title = soup.find('title').text.strip() + m = re.fullmatch(locale.regular_order_id, title.strip()) + assert m is not None + + logger.debug('...finished parsing regular invoice.') + return Order( + order_date=order_date, + order_id=m.group(1), + shipments=shipments, + credit_card_transactions=credit_card_transactions, + tax=tax, + errors=sum((shipment.errors + for shipment in shipments), cast(Errors, [])) + errors, + **output_fields) + + +def get_text_lines(parent_node): + text_lines = [''] + for node in parent_node.children: + if isinstance(node, bs4.NavigableString): + text_lines[-1] += str(node) + elif node.name == 'br': + text_lines.append('') + else: + text_lines[-1] += node.text + return text_lines - logger.debug('parsing payment information...') - payment_table = soup.find( - lambda node: node.name == 'table' and - node.text.strip().startswith(self.locale.payment_information_digital) - ) - credit_card_transactions = self.parse_credit_card_transactions_from_payments_table( - payment_table, order_date) - logger.debug('...finished') +def parse_digital_order_invoice(path: str, locale=Locale_en_EN()) -> Optional[Order]: + errors = [] + with open(path, 'rb') as f: + soup = bs4.BeautifulSoup(f.read(), 'lxml') - return Order( - order_date=order_date, - order_id=order_id, - shipments=[shipment], - credit_card_transactions=credit_card_transactions, - pretax_adjustments=[], - posttax_adjustments=output_fields['posttax_adjustments'], - tax=None, - errors=[]) + logger.debug('check if order has been cancelled...') + def is_cancelled_order(node): + return node.text.strip() == locale.digital_order_cancelled + + if soup.find(is_cancelled_order): + return None + + logger.debug('parsing header...') + def is_digital_order_row(node): + if node.name != 'tr': + return False + m = re.match(locale.digital_order, node.text.strip()) + if m is None: + return False + try: + locale.parse_date(m.group(1)) + return True + except: + return False + + # Find Digital Order row + digital_order_header = soup.find(is_digital_order_row) + digital_order_table = digital_order_header.find_parent('table') + m = re.match(locale.digital_order, digital_order_header.text.strip()) + assert m is not None + order_date = locale.parse_date(m.group(1)) + + logger.debug('parsing items...') + def is_items_ordered_header(node): + if node.name != 'tr': + return False + tds = node('td') + if len(tds) < 2: + return False + return (tds[0].text.strip() == locale.items_ordered and + tds[1].text.strip() == locale.price) + + items_ordered_header = digital_order_table.find(is_items_ordered_header) + + item_rows = items_ordered_header.find_next_siblings('tr') + items = [] + + other_fields_td = None + + for item_row in item_rows: + tds = item_row('td') + if len(tds) != 2: + other_fields_td = tds[0] + continue + description_node = tds[0] + price_node = tds[1] + price = price_node.text.strip() + + a = description_node.find('a') + if a is not None: + description = a.text.strip() + url = a['href'] + else: + bold_node = description_node.find('b') + description = bold_node.text.strip() + url = None + + text_lines = get_text_lines(description_node) + + def get_label_value(label): + for line in text_lines: + m = re.match(r'^\s*' + label + ': (.*)$', line, + re.UNICODE | re.DOTALL) + if m is None: + continue + return m.group(1) + + by = get_label_value(locale.digital_by) + sold_by = get_label_value(locale.digital_sold_by) + + items.append( + DigitalItem( + description=description, + by=by, + sold_by=sold_by, + url=url, + price=locale.parse_amount(price), + )) + + other_fields_text_lines = get_text_lines(other_fields_td) + + logger.debug('parsing amounts...') + def get_other_field(pattern, allow_multiple=False, return_label=False): + results = [] + for line in other_fields_text_lines: + r = r'^\s*(' + pattern + r')\s+(.*[^\s])\s*$' + m = re.match(r, line, re.UNICODE) + if m is not None: + results.append((m.group(1).strip(':'), m.group(2))) + if not return_label: + results = [r[1] for r in results] + if not allow_multiple: + if not results: + return None + return results[0] + return results + + def get_adjustments(pattern): + adjustments = [] + for label, amount_str in get_other_field( + pattern, allow_multiple=True, return_label=True): + adjustments.append( + Adjustment(amount=locale.parse_amount(amount_str), description=label)) + return adjustments + + def get_amounts_in_text(pattern_map): + amounts = dict() + for key, label in pattern_map.items(): + amount = locale.parse_amount(get_other_field(label)) + amounts[key] = amount + return amounts + + items_subtotal = locale.parse_amount( + get_other_field(locale.items_subtotal)) + total_before_tax = locale.parse_amount( + get_other_field(locale.total_before_tax)) + tax = get_adjustments(locale.digital_tax_collected) + total_for_this_order = locale.parse_amount( + get_other_field(locale.digital_total_order)) + + logger.debug('parsing pretax adjustments...') + output_fields = dict() + output_fields['pretax_adjustments'] = get_adjustments( + locale.pretax_adjustment_fields_pattern) + pretax_parts = ([items_subtotal] + + [a.amount for a in output_fields['pretax_adjustments']]) + logger.debug(pretax_parts) + logger.debug(total_before_tax) + expected_total_before_tax = reduce_amounts(pretax_parts) + if expected_total_before_tax != total_before_tax: + errors.append('expected total before tax is %s, but parsed value is %s' + % (expected_total_before_tax, total_before_tax)) + + logger.debug('parsing posttax adjustments...') + output_fields['posttax_adjustments'] = get_adjustments( + locale.posttax_adjustment_fields_pattern) + posttax_parts = ([total_before_tax] + [a.amount for a in tax] + + [a.amount for a in output_fields['posttax_adjustments']]) + expected_total = reduce_amounts(posttax_parts) + + logger.debug(total_for_this_order) + if expected_total != total_for_this_order: + errors.append('expected total is %s, but parsed value is %s' % + (expected_total, total_for_this_order)) + + if locale.tax_included_in_price: + tax = [] + + shipment = Shipment( + shipped_date=order_date, + items=items, + items_subtotal=items_subtotal, + total_before_tax=total_before_tax, + tax=tax, + total=total_for_this_order, + errors=errors, + **output_fields) + + order_id_td = soup.find( + lambda node: node.name == 'td' and + re.match(locale.digital_order_id, node.text.strip()) + ) + m = re.match(locale.digital_order_id, order_id_td.text.strip()) + assert m is not None + order_id = m.group(1) + + logger.debug('parsing payment information...') + payment_table = soup.find( + lambda node: node.name == 'table' and + node.text.strip().startswith(locale.digital_payment_information) + ) + credit_card_transactions = parse_credit_card_transactions_from_payments_table( + payment_table, order_date, locale=locale) + + logger.debug('...finished parsing digital invoice.') + + return Order( + order_date=order_date, + order_id=order_id, + shipments=[shipment], + credit_card_transactions=credit_card_transactions, + pretax_adjustments=[], + posttax_adjustments=output_fields['posttax_adjustments'], + tax=None, + errors=[]) def main(): @@ -966,18 +995,16 @@ def main(): default=False, action='store_true', help='Output in JSON format.') - # ToDo: add locale argument - # ap.add_argument( - # '--locale', default='EN', help='Local Amazon settings, defaults to EN') + ap.add_argument( + '--locale', default='en_EN', help='Local Amazon settings, defaults to EN') ap.add_argument('paths', nargs='*') - args = ap.parse_args() - amz_inv = AmazonInvoice() + locale = LOCALES[args.locale]() results = [] for path in args.paths: try: - result = amz_inv.parse_invoice(path) + result = parse_invoice(path, locale=locale) results.append(result) except: sys.stderr.write('Error reading: %s\n' % path) diff --git a/beancount_import/source/amazon_invoice_test.py b/beancount_import/source/amazon_invoice_test.py index 6eb8e382..bb2ac06f 100644 --- a/beancount_import/source/amazon_invoice_test.py +++ b/beancount_import/source/amazon_invoice_test.py @@ -19,8 +19,7 @@ ]) def test_parsing(name: str): source_path = os.path.join(testdata_dir, name + '.html') - amz_inv = amazon_invoice.AmazonInvoice() - invoice = amz_inv.parse_invoice(source_path) + invoice = amazon_invoice.parse_invoice(source_path) json_path = os.path.join(testdata_dir, name + '.json') expected = json.load( open(json_path, 'r'), object_pairs_hook=collections.OrderedDict) From b3760b7814cf81ff23ecf652ffe7b0e22591afef Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sat, 18 Dec 2021 14:39:55 +0100 Subject: [PATCH 04/42] updated example docstring with new locale names --- beancount_import/source/amazon.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/beancount_import/source/amazon.py b/beancount_import/source/amazon.py index e7a1b08b..968f5e54 100644 --- a/beancount_import/source/amazon.py +++ b/beancount_import/source/amazon.py @@ -41,7 +41,7 @@ 'Gift Card Amount': 'Assets:Gift-Cards:Amazon', 'Rewards Points': 'Income:Amazon:Cashback', }, - locale='EN' # optional, defaults to 'EN' + locale='en_EN' # optional, defaults to 'en_EN' ) The `amazon_account` key must be specified, and should be set to the email @@ -56,7 +56,7 @@ prediction will likely handle them. The `locale` sets country/language specific settings. -Currently, `EN` and `DE` are available. +Currently, `en_EN` and `de_DE` are available. Specifying credit cards ======================= From 6501b5f7f95161f561ec4c30bcd6f4cf5ead3335 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sat, 18 Dec 2021 20:27:12 +0100 Subject: [PATCH 05/42] added some translations (de_DE) --- beancount_import/source/amazon_invoice.py | 27 ++++++++--------------- 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 930ca243..fcd62e95 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -168,21 +168,14 @@ def __init__(self): # most of translations still missing ... pretax_adjustment_fields_pattern=('(?:' + '|'.join([ 'Verpackung & Versand', - 'Free Shipping', - 'Free delivery', - 'Pantry delivery', + # 'Free Shipping', 'Free delivery', 'Pantry delivery', 'Gutschein eingelöst', # english version not removed yet - 'Promotion(?:s| Applied)', - 'Lightning Deal', - 'Your Coupon Savings', - '[0-9]+% off savings', - 'Subscribe & Save', - '[0-9]+ Audible Credit Applied', - '.*[0-9]+% Off.*', - 'Courtesy Credit', - 'Extra Savings', - '(?:.*) Discount', - 'Gift[ -]Wrap', + 'Geschenkgutschein\(e\)', + # 'Promotion(?:s| Applied)', 'Lightning Deal', + # 'Your Coupon Savings', '[0-9]+% off savings', + # 'Subscribe & Save', '[0-9]+ Audible Credit Applied', + # '.*[0-9]+% Off.*', 'Courtesy Credit', + # 'Extra Savings', '(?:.*) Discount', 'Gift[ -]Wrap', ]) + ') *:'), posttax_adjustment_fields_pattern=r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X', @@ -210,13 +203,11 @@ def __init__(self): shipment_sales_tax='Anzurechnende MwSt.:', # not sure (only old invoices) shipment_total='Gesamtsumme:', shipment_seller_profile=' (Mitgliedsprofil)', - shipment_sold_by=r'(?P.*)\n\s*(?:Verkauf|Provided) durch:? (?P[^\n]+)', - # Translation missing: Provided by + shipment_sold_by=r'(?P.*)\n\s*(?:Verkauf) durch:? (?P[^\n]+)', shipment_condition=r'\n.*\n\s*Zustand: (?P[^\n]+)', regular_total_order='Gesamtsumme:', regular_estimated_tax='Anzurechnende MwSt.:', - regular_order_placed=r'(?:Subscribe and Save )?Bestellung aufgegeben am:\s+(\d+\. [^\s]+ \d{4})', - # Translation missing: Subscribe and Save -> Sparabo?? + regular_order_placed=r'(?:Getätigte Spar-Abo-Bestellung|Bestellung aufgegeben am):\s+(\d+\. [^\s]+ \d{4})', regular_order_id=r'.*Bestellung ([0-9\-]+)', # digital orders only From c1c8af81b08c0d4ed54d1ee815e3b8c29371f377 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sat, 18 Dec 2021 20:34:20 +0100 Subject: [PATCH 06/42] add ability to parse prepended 3 letter currency specification, e.g. USD 1.99 --- beancount_import/amount_parsing.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/beancount_import/amount_parsing.py b/beancount_import/amount_parsing.py index 79e25c2c..213d8539 100644 --- a/beancount_import/amount_parsing.py +++ b/beancount_import/amount_parsing.py @@ -30,12 +30,18 @@ def parse_amount(x, assumed_currency=None): if not x: return None sign, amount_str = parse_possible_negative(x) - m = re.fullmatch(r'(?:[(][^)]+[)])?\s*([\$€£])?((?:[0-9](?:,?[0-9])*|(?=\.))(?:\.[0-9]+)?)(?:\s+([A-Z]{3}))?', amount_str) + m = re.fullmatch(r'(?:[(][^)]+[)])?\s*([\$€£]|[A-Z]{3})?\s*((?:[0-9](?:,?[0-9])*|(?=\.))(?:\.[0-9]+)?)(?:\s+([A-Z]{3}))?', amount_str) if m is None: raise ValueError('Failed to parse amount from %r' % amount_str) if m.group(1): - currency = {'$': 'USD', '€': 'EUR', '£': 'GBP'}[m.group(1)] + # unit before amount + if len(m.group(1)) == 3: + # 'EUR' or 'USD' + currency = m.group(1) + else: + currency = {'$': 'USD', '€': 'EUR', '£': 'GBP'}[m.group(1)] elif m.group(3): + # unit after amount currency = m.group(3) elif assumed_currency is not None: currency = assumed_currency From e9a9c953f7f7bc4767d3d60fd67f9be20a98a45e Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sat, 18 Dec 2021 22:16:48 +0100 Subject: [PATCH 07/42] updated invoice sanitizer to not completely remove payment table for digital invoices (de_DE) --- .../source/amazon_invoice_sanitize.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/beancount_import/source/amazon_invoice_sanitize.py b/beancount_import/source/amazon_invoice_sanitize.py index 6a2da515..757e02a5 100644 --- a/beancount_import/source/amazon_invoice_sanitize.py +++ b/beancount_import/source/amazon_invoice_sanitize.py @@ -51,16 +51,27 @@ def get_replacement(m): def sanitize_credit_card(contents: str, new_digits: str): + # en_EN contents = re.sub(r'(ending in\s+)[0-9]{4}', lambda m: m.group(1) + new_digits, contents) contents = re.sub(r'(Last (?:[a-zA-Z0-9\s]*)digits:\s*)[0-9]{4}', lambda m: m.group(1) + new_digits, contents) + # de_DE + contents = re.sub(r'(mit den Endziffern\s+)[0-9]{4}', + lambda m: m.group(1) + new_digits, contents) + contents = re.sub(r'(Die letzten(?:[a-zA-Z0-9\s]*)Ziffern:\s*)[0-9]{4}', + lambda m: m.group(1) + new_digits, contents) return contents def sanitize_address(contents: str): - return re.sub( - '^.*address.*$', '', contents, flags=re.IGNORECASE | re.MULTILINE) + contents = re.sub( + '^.*displayaddress.*$', '', contents, flags=re.IGNORECASE | re.MULTILINE) + + # some invoices have shipping address given in payment table in different format (e.g. de_DE digital) + contents = re.sub( + r'
  • .*<\/ul>', '', contents) + return contents def remove_tag(soup: bs4.BeautifulSoup, tag: str): From 03b9a06ac068b1d0b4c587ae942e52028dc2a4d4 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sat, 18 Dec 2021 22:18:47 +0100 Subject: [PATCH 08/42] add some sanitized invoice tests for de_DE --- .../source/amazon_invoice_test.py | 25 +- .../amazon/de_DE/256-0244967-2403944.html | 327 ++++++++++ .../amazon/de_DE/256-0244967-2403944.json | 57 ++ .../amazon/de_DE/393-2608279-9292916.html | 361 +++++++++++ .../amazon/de_DE/393-2608279-9292916.json | 71 +++ .../amazon/de_DE/898-5185906-0096901.html | 380 ++++++++++++ .../amazon/de_DE/898-5185906-0096901.json | 64 ++ .../amazon/de_DE/974-6135682-9358749.html | 586 ++++++++++++++++++ .../amazon/de_DE/974-6135682-9358749.json | 131 ++++ .../amazon/de_DE/D22-9220967-2566135.html | 233 +++++++ .../amazon/de_DE/D22-9220967-2566135.json | 35 ++ .../amazon/de_DE/D60-9825125-4795642.html | 233 +++++++ .../amazon/de_DE/D60-9825125-4795642.json | 35 ++ 13 files changed, 2537 insertions(+), 1 deletion(-) create mode 100644 testdata/source/amazon/de_DE/256-0244967-2403944.html create mode 100644 testdata/source/amazon/de_DE/256-0244967-2403944.json create mode 100644 testdata/source/amazon/de_DE/393-2608279-9292916.html create mode 100644 testdata/source/amazon/de_DE/393-2608279-9292916.json create mode 100644 testdata/source/amazon/de_DE/898-5185906-0096901.html create mode 100644 testdata/source/amazon/de_DE/898-5185906-0096901.json create mode 100644 testdata/source/amazon/de_DE/974-6135682-9358749.html create mode 100644 testdata/source/amazon/de_DE/974-6135682-9358749.json create mode 100644 testdata/source/amazon/de_DE/D22-9220967-2566135.html create mode 100644 testdata/source/amazon/de_DE/D22-9220967-2566135.json create mode 100644 testdata/source/amazon/de_DE/D60-9825125-4795642.html create mode 100644 testdata/source/amazon/de_DE/D60-9825125-4795642.json diff --git a/beancount_import/source/amazon_invoice_test.py b/beancount_import/source/amazon_invoice_test.py index bb2ac06f..98326b82 100644 --- a/beancount_import/source/amazon_invoice_test.py +++ b/beancount_import/source/amazon_invoice_test.py @@ -17,7 +17,7 @@ '166-7926740-5141621', 'D56-5204779-4181560', ]) -def test_parsing(name: str): +def test_parsing_en_EN(name: str): source_path = os.path.join(testdata_dir, name + '.html') invoice = amazon_invoice.parse_invoice(source_path) json_path = os.path.join(testdata_dir, name + '.json') @@ -29,3 +29,26 @@ def test_parsing(name: str): if expected_str != actual_str: print(actual_str) assert expected_str == actual_str + + +@pytest.mark.parametrize('name', [ + '256-0244967-2403944', + '393-2608279-9292916', + '898-5185906-0096901', + '974-6135682-9358749', + 'D22-9220967-2566135', + 'D60-9825125-4795642' +]) +def test_parsing_de_DE(name: str): + testdata_dir_locale = os.path.join(testdata_dir, 'de_DE') + source_path = os.path.join(testdata_dir_locale, name + '.html') + invoice = amazon_invoice.parse_invoice(source_path, locale=amazon_invoice.LOCALES['de_DE']()) + json_path = os.path.join(testdata_dir_locale, name + '.json') + expected = json.load( + open(json_path, 'r'), object_pairs_hook=collections.OrderedDict) + expected_str = json.dumps(expected, indent=4) + actual = amazon_invoice.to_json(invoice) + actual_str = json.dumps(actual, indent=4) + if expected_str != actual_str: + print(actual_str) + assert expected_str == actual_str diff --git a/testdata/source/amazon/de_DE/256-0244967-2403944.html b/testdata/source/amazon/de_DE/256-0244967-2403944.html new file mode 100644 index 00000000..ed841e72 --- /dev/null +++ b/testdata/source/amazon/de_DE/256-0244967-2403944.html @@ -0,0 +1,327 @@ + + + + + + +Amazon.de - Bestellung 256-0244967-2403944 + + + + +
    +
    +
    + Übersicht zur Bestellung #256-0244967-2403944 +
    +Bitte drucken Sie diese Seite aus und legen Sie sie zu Ihren Unterlagen. +

    + + + + +
    + + + + + +
    + + Bestellung aufgegeben am: + + 27. September 2021 +
    +Bestellnummer: + 256-0244967-2403944 +
    +Bestellnummer seitens des Verkäufers: + 9254259 +
    +Gesamtbestellwert: + EUR 23,96 +
    +
    + + + + +
    + + + + + + + + + + +
    + + + + +
    +
    + versandt am 28. September 2021 +
    +
    +
    + + + + +
    + + + + +
    +   +
    + + + + + + + + + +
    +Bestellte Artikel + +Preis +
    + + 1 + + Exemplar(e) von: + + FC Bayern München Cuddly fleece blanket 150 x 200 cm
    + + Verkauf durch: Offizieller FC Bayern Store (Mitgliedsprofil) + + + + + + + + + + + + + + + + + + +
    +
    + + Zustand: Neu
    +
    +
    +EUR 23,96
    +
    +
    +
    +
    + + + + + +
    + +Versandadresse + +
    + + + + + + + + + +
    +Versandart: + +
    +Standardversand +
    +
    + +
    +
    +
    +
    +
    + + + + +
    + + + + + + + + + + +
    + + + + +
    +
    Zahlungsdaten
    +
    +
    + + + + +
    + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + +
    Zwischensumme:EUR 23,96
    Verpackung & Versand:EUR 0,00
     -----
    Summe:EUR 23,96
     -----
    Gesamtsumme: EUR 23,96
    +
    +Zahlungsart: +
    + + + Visa / Electron + | Die letzten Ziffern: 1234 +
    +
    +Rechnungsadresse: + + + + + + + + + +
    +
    + + + + + +
    +
    Kreditkarten-Transaktionen 
    +
    + + + + + +
    + Visa mit den Endziffern 1234: 28. September 2021: + +EUR 23,96 +
    +
    +
    +
    +
    +
    +

    Um den Status Ihrer Bestellung einzusehen, kehren Sie auf Bestellungsübersicht zurück.

    +

    Hinweis: Dies ist keine Rechnung.

    +
    + +
    + + + +
    + +Unsere AGB |  + + + + + + +Datenschutzerklärung |  + + + + + + +Impressum  © 7744-6638, Amazon.com, Inc. und Tochtergesellschaften +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/256-0244967-2403944.json b/testdata/source/amazon/de_DE/256-0244967-2403944.json new file mode 100644 index 00000000..eaaa047d --- /dev/null +++ b/testdata/source/amazon/de_DE/256-0244967-2403944.json @@ -0,0 +1,57 @@ +{ + "order_id": "256-0244967-2403944", + "order_date": "2021-09-27", + "shipments": [ + { + "shipped_date": "2021-09-28", + "items": [ + { + "quantity": "1", + "description": "FC Bayern M\u00fcnchen Cuddly fleece blanket 150 x 200 cm", + "sold_by": "Offizieller FC Bayern Store", + "condition": "Neu", + "price": { + "number": "23.96", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": { + "number": "23.96", + "currency": "EUR" + }, + "posttax_adjustments": [], + "tax": [], + "total": { + "number": "23.96", + "currency": "EUR" + }, + "errors": [] + } + ], + "credit_card_transactions": [ + { + "date": "2021-09-28", + "card_description": "Visa", + "card_ending_in": "1234", + "amount": { + "number": "23.96", + "currency": "EUR" + } + } + ], + "pretax_adjustments": [ + { + "description": "Verpackung & Versand", + "amount": { + "number": "0.00", + "currency": "EUR" + } + } + ], + "tax": null, + "posttax_adjustments": [], + "errors": [] +} \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/393-2608279-9292916.html b/testdata/source/amazon/de_DE/393-2608279-9292916.html new file mode 100644 index 00000000..79859219 --- /dev/null +++ b/testdata/source/amazon/de_DE/393-2608279-9292916.html @@ -0,0 +1,361 @@ + + + + + + +Amazon.de - Bestellung 393-2608279-9292916 + + + + +
    +
    +
    + Übersicht zur Bestellung #393-2608279-9292916 +
    +Bitte drucken Sie diese Seite aus und legen Sie sie zu Ihren Unterlagen. +

    + + + + +
    + + + + + + + +
    + + Getätigte Spar-Abo-Bestellung: + + 15. Mai 2018 +
    +Bestellnummer: + 393-2608279-9292916 +
    +Gesamtbestellwert: + EUR 0,00 +
    + + Diese Bestellung enthält Abonnieren-und-Sparen-Artikel. + +
    +
    + + + + +
    + + + + + + + + + + +
    + + + + +
    +
    + versandt am 8. Juni 2018 +
    +
    +
    + + + + +
    + + + + +
    +   +
    + + + + + + + + + +
    +Bestellte Artikel + +Preis +
    + + 1 + + Exemplar(e) von: + + Lavazza Caffè Decaffeinato, 2er Pack (2 x 500 g Packung)
    + + Verkauf durch: Amazon EU S.a.r.L. + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + Zustand: Neu
    +
    +
    +EUR 16,98
    +
    +
    +
    +
    + + + + + +
    + +Versandadresse + +
    + + + + + + + + + +
    +Versandart: + +
    +Standard-Versand +
    +
    + +
    +
    +
    +
    +
    + + + + +
    + + + + + + + +
    + + + + +
    +
    Zahlungsdaten
    +
    +
    + + + + +
    + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Zwischensumme:EUR 15,87
    Verpackung & Versand:EUR 0,00
     -----
    Summe ohne MwSt.:EUR 15,87
    Anzurechnende MwSt.:EUR 1,11
     -----
    Summe:EUR 16,98
    Gutschein eingelöst:-EUR 0,85
    Geschenkgutschein(e):-EUR 16,13
     -----
    Gesamtsumme: EUR 0,00
    +
    +Zahlungsart: +
    + + + Visa / Electron + | Die letzten Ziffern: 1234 +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + Geschenkgutschein
    +
    +Rechnungsadresse: + + + + + + + + + +
    +
    +
    +
    +
    +

    Um den Status Ihrer Bestellung einzusehen, kehren Sie auf Bestellungsübersicht zurück.

    +

    Hinweis: Dies ist keine Rechnung.

    +

    +
    + +
    + + + +
    + +Unsere AGB |  + + + + + + +Datenschutzerklärung |  + + + + + + +Impressum  © 9585-1942, Amazon.com, Inc. und Tochtergesellschaften +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/393-2608279-9292916.json b/testdata/source/amazon/de_DE/393-2608279-9292916.json new file mode 100644 index 00000000..3872b61b --- /dev/null +++ b/testdata/source/amazon/de_DE/393-2608279-9292916.json @@ -0,0 +1,71 @@ +{ + "order_id": "393-2608279-9292916", + "order_date": "2018-05-15", + "shipments": [ + { + "shipped_date": "2018-06-08", + "items": [ + { + "quantity": "1", + "description": "Lavazza Caff\u00e8 Decaffeinato, 2er Pack (2 x 500 g Packung)", + "sold_by": "Amazon EU S.a.r.L.", + "condition": "Neu", + "price": { + "number": "16.98", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": { + "number": "16.98", + "currency": "EUR" + }, + "posttax_adjustments": [], + "tax": [], + "total": { + "number": "16.98", + "currency": "EUR" + }, + "errors": [] + } + ], + "credit_card_transactions": [ + { + "date": "2018-05-15", + "card_description": "Visa / Electron", + "card_ending_in": "1234", + "amount": { + "number": "0.00", + "currency": "EUR" + } + } + ], + "pretax_adjustments": [ + { + "description": "Verpackung & Versand", + "amount": { + "number": "0.00", + "currency": "EUR" + } + }, + { + "description": "Gutschein eingel\u00f6st", + "amount": { + "number": "-0.85", + "currency": "EUR" + } + }, + { + "description": "Geschenkgutschein(e)", + "amount": { + "number": "-16.13", + "currency": "EUR" + } + } + ], + "tax": null, + "posttax_adjustments": [], + "errors": [] +} \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/898-5185906-0096901.html b/testdata/source/amazon/de_DE/898-5185906-0096901.html new file mode 100644 index 00000000..076bd674 --- /dev/null +++ b/testdata/source/amazon/de_DE/898-5185906-0096901.html @@ -0,0 +1,380 @@ + + + + + + +Amazon.de - Bestellung 898-5185906-0096901 + + + + +
    +
    +
    + Übersicht zur Bestellung #898-5185906-0096901 +
    +Bitte drucken Sie diese Seite aus und legen Sie sie zu Ihren Unterlagen. +

    + + + + +
    + + + + + + + +
    + + Getätigte Spar-Abo-Bestellung: + + 12. März 2018 +
    +Bestellnummer: + 898-5185906-0096901 +
    +Gesamtbestellwert: + EUR 15,75 +
    + + Diese Bestellung enthält Abonnieren-und-Sparen-Artikel. + +
    +
    + + + + +
    + + + + + + + + + + +
    + + + + +
    +
    + versandt am 9. April 2018 +
    +
    +
    + + + + +
    + + + + +
    +   +
    + + + + + + + + + +
    +Bestellte Artikel + +Preis +
    + + 1 + + Exemplar(e) von: + + Lavazza Caffè Decaffeinato, 2er Pack (2 x 500 g Packung)
    + + Verkauf durch: Amazon EU S.a.r.L. + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + Zustand: Neu
    +
    +
    +EUR 16,58
    +
    +
    +
    +
    + + + + + +
    + +Versandadresse + +
    + + + + + + + + + +
    +Versandart: + +
    +Standard-Versand +
    +
    + +
    +
    +
    +
    +
    + + + + +
    + + + + + + + + + + +
    + + + + +
    +
    Zahlungsdaten
    +
    +
    + + + + +
    + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Zwischensumme:EUR 15,50
    Verpackung & Versand:EUR 0,00
     -----
    Summe ohne MwSt.:EUR 15,50
    Anzurechnende MwSt.:EUR 1,08
     -----
    Summe:EUR 16,58
    Gutschein eingelöst:-EUR 0,83
     -----
    Gesamtsumme: EUR 15,75
    +
    +Zahlungsart: +
    + + + Visa / Electron + | Die letzten Ziffern: 1234 +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + Geschenkgutschein
    +
    +Rechnungsadresse: + + + + + + + + + +
    +
    + + + + + +
    +
    Kreditkarten-Transaktionen 
    +
    + + + + + +
    + Visa mit den Endziffern 1234: 9. April 2018: + +EUR 15,75 +
    +
    +
    +
    +
    +
    +

    Um den Status Ihrer Bestellung einzusehen, kehren Sie auf Bestellungsübersicht zurück.

    +

    Hinweis: Dies ist keine Rechnung.

    +

    +
    + +
    + + + +
    + +Unsere AGB |  + + + + + + +Datenschutzerklärung |  + + + + + + +Impressum  © 8399-2848, Amazon.com, Inc. und Tochtergesellschaften +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/898-5185906-0096901.json b/testdata/source/amazon/de_DE/898-5185906-0096901.json new file mode 100644 index 00000000..26bd89f6 --- /dev/null +++ b/testdata/source/amazon/de_DE/898-5185906-0096901.json @@ -0,0 +1,64 @@ +{ + "order_id": "898-5185906-0096901", + "order_date": "2018-03-12", + "shipments": [ + { + "shipped_date": "2018-04-09", + "items": [ + { + "quantity": "1", + "description": "Lavazza Caff\u00e8 Decaffeinato, 2er Pack (2 x 500 g Packung)", + "sold_by": "Amazon EU S.a.r.L.", + "condition": "Neu", + "price": { + "number": "16.58", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": { + "number": "16.58", + "currency": "EUR" + }, + "posttax_adjustments": [], + "tax": [], + "total": { + "number": "16.58", + "currency": "EUR" + }, + "errors": [] + } + ], + "credit_card_transactions": [ + { + "date": "2018-04-09", + "card_description": "Visa", + "card_ending_in": "1234", + "amount": { + "number": "15.75", + "currency": "EUR" + } + } + ], + "pretax_adjustments": [ + { + "description": "Verpackung & Versand", + "amount": { + "number": "0.00", + "currency": "EUR" + } + }, + { + "description": "Gutschein eingel\u00f6st", + "amount": { + "number": "-0.83", + "currency": "EUR" + } + } + ], + "tax": null, + "posttax_adjustments": [], + "errors": [] +} \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/974-6135682-9358749.html b/testdata/source/amazon/de_DE/974-6135682-9358749.html new file mode 100644 index 00000000..3bc8222f --- /dev/null +++ b/testdata/source/amazon/de_DE/974-6135682-9358749.html @@ -0,0 +1,586 @@ + + + + + + +Amazon.de - Bestellung 974-6135682-9358749 + + + + +
    +
    +
    + Übersicht zur Bestellung #974-6135682-9358749 +
    +Bitte drucken Sie diese Seite aus und legen Sie sie zu Ihren Unterlagen. +

    + + + + +
    + + + + +
    + + Bestellung aufgegeben am: + + 20. September 2021 +
    +Bestellnummer: + 974-6135682-9358749 +
    +Gesamtbestellwert: + EUR 33,66 +
    +
    + + + + +
    + + + + + + + + + + +
    + + + + +
    +
    + versandt am 20. September 2021 +
    +
    +
    + + + + +
    + + + + +
    +   +
    + + + + + + + + + +
    +Bestellte Artikel + +Preis +
    + + 1 + + Exemplar(e) von: + + Die kleine Kees de Kort-Kinderbibel (Was uns die Bibel erzählt. Neue Serie), Kees de Kort
    + + Verkauf durch: Amazon EU S.a.r.L. + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + Zustand: Neu
    +
    +
    +EUR 13,00
    +
    +
    +
    +
    + + + + + +
    + +Versandadresse + +
    + + + + + + + + + +
    +Versandart: + +
    +Premiumversand +
    +
    + +
    +
    +
    +
    +
    + + + + +
    + + + + + + + + + + +
    + + + + +
    +
    + versandt am 20. September 2021 +
    +
    +
    + + + + +
    + + + + +
    +   +
    + + + + + + + + + +
    +Bestellte Artikel + +Preis +
    + + 1 + + Exemplar(e) von: + + Schwungübungen Ab 3 Jahren: Übungsheft Mit Schwungübungen Zur Erhöhung Der Konzentration, Augen-Hand-Koordination Und Feinmotorik. Ideale Vorberei, Eichelberger, Laura
    + + Verkauf durch: Amazon EU S.a.r.L. + + + + + + + + + + + + +
    +
    + + Zustand: Neu
    +
    +
    +EUR 5,95
    +
    +
    +
    +
    + + + + + +
    + +Versandadresse + +
    + + + + + + + + + +
    +Versandart: + +
    +Premiumversand +
    +
    + +
    +
    +
    +
    +
    + + + + +
    + + + + + + + + + + +
    + + + + +
    +
    + versandt am 20. September 2021 +
    +
    +
    + + + + +
    + + + + +
    +   +
    + + + + + + + + + +
    +Bestellte Artikel + +Preis +
    + + 1 + + Exemplar(e) von: + + Pelikan 723122 Mini Friends 755/8 Multi-Coloured Paint Box with 8 Colours and Brushes, Paint palette, multicoloured
    + + Verkauf durch: Amazon EU S.a.r.L. + + + + + + + + + + + + +
    +
    + + Zustand: Neu
    +
    +
    +EUR 14,71
    +
    +
    +
    +
    + + + + + +
    + +Versandadresse + +
    + + + + + + + + + +
    +Versandart: + +
    +Premiumversand +
    +
    + +
    +
    +
    +
    +
    + + + + +
    + + + + + + + + + + +
    + + + + +
    +
    Zahlungsdaten
    +
    +
    + + + + +
    + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Zwischensumme:EUR 30,07
    Verpackung & Versand:EUR 0,00
     -----
    Summe ohne MwSt.:EUR 30,07
    Anzurechnende MwSt.:EUR 3,59
     -----
    Summe:EUR 33,66
     -----
    Gesamtsumme: EUR 33,66
    +
    +Zahlungsart: +
    + + + Visa / Electron + | Die letzten Ziffern: 1234 +
    +
    +Rechnungsadresse: + + + + + + + + + +
    +
    + + + + + +
    +
    Kreditkarten-Transaktionen 
    +
    + + + + + + + + + + + + + +
    + Visa mit den Endziffern 1234: 20. September 2021: + +EUR 5,95 +
    + Visa mit den Endziffern 1234: 20. September 2021: + +EUR 13,00 +
    + Visa mit den Endziffern 1234: 20. September 2021: + +EUR 14,71 +
    +
    +
    +
    +
    +
    +

    Um den Status Ihrer Bestellung einzusehen, kehren Sie auf Bestellungsübersicht zurück.

    +

    Hinweis: Dies ist keine Rechnung.

    +
    + +
    + + + +
    + +Unsere AGB |  + + + + + + +Datenschutzerklärung |  + + + + + + +Impressum  © 0470-8920, Amazon.com, Inc. und Tochtergesellschaften +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/974-6135682-9358749.json b/testdata/source/amazon/de_DE/974-6135682-9358749.json new file mode 100644 index 00000000..23e63ce1 --- /dev/null +++ b/testdata/source/amazon/de_DE/974-6135682-9358749.json @@ -0,0 +1,131 @@ +{ + "order_id": "974-6135682-9358749", + "order_date": "2021-09-20", + "shipments": [ + { + "shipped_date": "2021-09-20", + "items": [ + { + "quantity": "1", + "description": "Die kleine Kees de Kort-Kinderbibel (Was uns die Bibel erz\u00e4hlt. Neue Serie), Kees de Kort", + "sold_by": "Amazon EU S.a.r.L.", + "condition": "Neu", + "price": { + "number": "13.00", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": { + "number": "13.00", + "currency": "EUR" + }, + "posttax_adjustments": [], + "tax": [], + "total": { + "number": "13.00", + "currency": "EUR" + }, + "errors": [] + }, + { + "shipped_date": "2021-09-20", + "items": [ + { + "quantity": "1", + "description": "Schwung\u00fcbungen Ab 3 Jahren: \u00dcbungsheft Mit Schwung\u00fcbungen Zur Erh\u00f6hung Der Konzentration, Augen-Hand-Koordination Und Feinmotorik. Ideale Vorberei, Eichelberger, Laura", + "sold_by": "Amazon EU S.a.r.L.", + "condition": "Neu", + "price": { + "number": "5.95", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": { + "number": "5.95", + "currency": "EUR" + }, + "posttax_adjustments": [], + "tax": [], + "total": { + "number": "5.95", + "currency": "EUR" + }, + "errors": [] + }, + { + "shipped_date": "2021-09-20", + "items": [ + { + "quantity": "1", + "description": "Pelikan 723122 Mini Friends 755/8 Multi-Coloured Paint Box with 8 Colours and Brushes, Paint palette, multicoloured", + "sold_by": "Amazon EU S.a.r.L.", + "condition": "Neu", + "price": { + "number": "14.71", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": { + "number": "14.71", + "currency": "EUR" + }, + "posttax_adjustments": [], + "tax": [], + "total": { + "number": "14.71", + "currency": "EUR" + }, + "errors": [] + } + ], + "credit_card_transactions": [ + { + "date": "2021-09-20", + "card_description": "Visa", + "card_ending_in": "1234", + "amount": { + "number": "5.95", + "currency": "EUR" + } + }, + { + "date": "2021-09-20", + "card_description": "Visa", + "card_ending_in": "1234", + "amount": { + "number": "13.00", + "currency": "EUR" + } + }, + { + "date": "2021-09-20", + "card_description": "Visa", + "card_ending_in": "1234", + "amount": { + "number": "14.71", + "currency": "EUR" + } + } + ], + "pretax_adjustments": [ + { + "description": "Verpackung & Versand", + "amount": { + "number": "0.00", + "currency": "EUR" + } + } + ], + "tax": null, + "posttax_adjustments": [], + "errors": [] +} \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/D22-9220967-2566135.html b/testdata/source/amazon/de_DE/D22-9220967-2566135.html new file mode 100644 index 00000000..0bf5ca35 --- /dev/null +++ b/testdata/source/amazon/de_DE/D22-9220967-2566135.html @@ -0,0 +1,233 @@ + + + + +Amazon.de: Übersicht - Digitale Bestellung + + + + + +
    + + + +

    +
    + + Details zur Bestellung # D22-9220967-2566135 + +
    +Drucken Sie diese Seite für Ihre Unterlagen aus. + +
    +
    +
    + + + + + + + + +
    +Amazon.de Bestellnummer: D22-9220967-2566135 +
    +Summe der Bestellung: EUR 9,95 +

    + + + + + + + +
    +Digitale Bestellung: 31 Mai 2020 +
    + + + + +
    + + + + + + + + +
    +Bestellte Artikel
    +
    +Preis +
    +Audible Flexi-Abo [Digitales Abo][Hörbuch]
    +
    +Verkauft von: Audible

    EUR 9,95
    + + + Zwischensumme Artikel: EUR 9,30
    + + + + + ----
    + + + + + +Gesamtbetrag: EUR 9,30
    MwSt: EUR 0,65
    + ----
    +Gesamtbetrag für diese Bestellung: EUR 9,95
    +
    +
    +
    +
    +
    + + + + + + + +
    +Zahlungsinformation +
    + + + + +
    + + +
    Zahlungsarten
    • endet im 3044
    Rechnungsadresse
    Zwischensumme Artikel:
    EUR 9,30

    Gesamtbetrag:
    EUR 9,30
    MwSt:
    EUR 0,65


    Endsumme:
    EUR 9,95
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    +
    +

    + Zurück zu Bestellungsübersicht. +

    +

    Anmerkung: Dies ist keine Rechnung mit Mehrwertsteuer.

    +

    +

    +
    +
    + + + +
    + +Unsere AGB |  + + + + + + +Datenschutzerklärung |  + + + + + + +Impressum  © 2841-0341, Amazon.com, Inc. und Tochtergesellschaften +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/D22-9220967-2566135.json b/testdata/source/amazon/de_DE/D22-9220967-2566135.json new file mode 100644 index 00000000..5c7a6da1 --- /dev/null +++ b/testdata/source/amazon/de_DE/D22-9220967-2566135.json @@ -0,0 +1,35 @@ +{ + "order_id": "D22-9220967-2566135", + "order_date": "2020-05-31", + "shipments": [ + { + "shipped_date": "2020-05-31", + "items": [ + { + "description": "Audible Flexi-Abo [Digitales Abo]", + "url": "https://www.amazon.de/dp/B08H5XW8SJ/ref=docs-os-doi_0", + "sold_by": "Audible", + "by": null, + "price": { + "number": "9.95", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": null, + "posttax_adjustments": [], + "tax": [], + "total": null, + "errors": [ + "expected total is 0.65 EUR, but parsed value is None" + ] + } + ], + "credit_card_transactions": [], + "pretax_adjustments": [], + "tax": null, + "posttax_adjustments": [], + "errors": [] +} \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/D60-9825125-4795642.html b/testdata/source/amazon/de_DE/D60-9825125-4795642.html new file mode 100644 index 00000000..7eadad54 --- /dev/null +++ b/testdata/source/amazon/de_DE/D60-9825125-4795642.html @@ -0,0 +1,233 @@ + + + + +Amazon.de: Übersicht - Digitale Bestellung + + + + + +
    + + + +

    +
    + + Details zur Bestellung # D60-9825125-4795642 + +
    +Drucken Sie diese Seite für Ihre Unterlagen aus. + +
    +
    +
    + + + + + + + + +
    +Amazon.de Bestellnummer: D60-9825125-4795642 +
    +Summe der Bestellung: EUR 4,98 +

    + + + + + + + +
    +Digitale Bestellung: 22 Dezember 2019 +
    + + + + +
    + + + + + + + + +
    +Bestellte Artikel
    +
    +Preis +
    +Das erstaunliche Leben des Walter Mitty [dt./OV][Prime Video]
    Von: Ben Stiller, Kristen Wiig, Jon Daly
    +
    +Verkauft von: Amazon Digital Germany GmbH

    EUR 4,98
    + + + Zwischensumme Artikel: EUR 4,18
    + + + + + ----
    + + + + + +Gesamtbetrag: EUR 4,18
    MwSt: EUR 0,80
    + ----
    +Gesamtbetrag für diese Bestellung: EUR 4,98
    +
    +
    +
    +
    +
    + + + + + + + +
    +Zahlungsinformation +
    + + + + +
    + + +
    Zahlungsarten
    • endet im 3044
    Rechnungsadresse
    Zwischensumme Artikel:
    EUR 4,18

    Gesamtbetrag:
    EUR 4,18
    MwSt:
    EUR 0,80


    Endsumme:
    EUR 4,98
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    +
    +

    + Zurück zu Bestellungsübersicht. +

    +

    Anmerkung: Dies ist keine Rechnung mit Mehrwertsteuer.

    +

    +

    +
    +
    + + + +
    + +Unsere AGB |  + + + + + + +Datenschutzerklärung |  + + + + + + +Impressum  © 4153-1084, Amazon.com, Inc. und Tochtergesellschaften +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/D60-9825125-4795642.json b/testdata/source/amazon/de_DE/D60-9825125-4795642.json new file mode 100644 index 00000000..82072efa --- /dev/null +++ b/testdata/source/amazon/de_DE/D60-9825125-4795642.json @@ -0,0 +1,35 @@ +{ + "order_id": "D60-9825125-4795642", + "order_date": "2019-12-22", + "shipments": [ + { + "shipped_date": "2019-12-22", + "items": [ + { + "description": "Das erstaunliche Leben des Walter Mitty [dt./OV]", + "url": "https://www.amazon.de/dp/B00JZPPGNC/ref=docs-os-doi_0", + "sold_by": "Amazon Digital Germany GmbH", + "by": "Ben Stiller, Kristen Wiig, Jon Daly", + "price": { + "number": "4.98", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": null, + "posttax_adjustments": [], + "tax": [], + "total": null, + "errors": [ + "expected total is 0.80 EUR, but parsed value is None" + ] + } + ], + "credit_card_transactions": [], + "pretax_adjustments": [], + "tax": null, + "posttax_adjustments": [], + "errors": [] +} \ No newline at end of file From 02cb706f3e416a2ef782bfe0f40a157659c92cd4 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 19 Dec 2021 07:58:50 +0100 Subject: [PATCH 09/42] change invoice.tax from None to [] to match convention of other JSON fields --- beancount_import/source/amazon.py | 2 +- beancount_import/source/amazon_invoice.py | 4 ++-- testdata/source/amazon/D56-5204779-4181560.json | 2 +- testdata/source/amazon/de_DE/256-0244967-2403944.json | 2 +- testdata/source/amazon/de_DE/393-2608279-9292916.json | 2 +- testdata/source/amazon/de_DE/898-5185906-0096901.json | 2 +- testdata/source/amazon/de_DE/974-6135682-9358749.json | 2 +- testdata/source/amazon/de_DE/D22-9220967-2566135.json | 2 +- testdata/source/amazon/de_DE/D60-9825125-4795642.json | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/beancount_import/source/amazon.py b/beancount_import/source/amazon.py index 968f5e54..35ec6c99 100644 --- a/beancount_import/source/amazon.py +++ b/beancount_import/source/amazon.py @@ -394,7 +394,7 @@ def make_amazon_transaction( (INVOICE_DESCRIPTION, adjustment.description), ]), )) - if invoice.tax is not None and invoice.tax.number != ZERO: + if len(invoice.tax)>0 and invoice.tax.number != ZERO: txn.postings.append( Posting( account=unknown_account_name, diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index fcd62e95..b34a4f8f 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -703,7 +703,7 @@ def resolve_posttax_adjustments(): if locale.tax_included_in_price: # tax is already inlcuded in item prices # do not add additional transaction for taxes - tax = None + tax = [] logger.debug('consistency check grand total...') payments_total_adjustment = reduce_amounts(payments_total_adjustments) @@ -971,7 +971,7 @@ def get_amounts_in_text(pattern_map): credit_card_transactions=credit_card_transactions, pretax_adjustments=[], posttax_adjustments=output_fields['posttax_adjustments'], - tax=None, + tax=[], errors=[]) diff --git a/testdata/source/amazon/D56-5204779-4181560.json b/testdata/source/amazon/D56-5204779-4181560.json index 09aac4e8..35b62d78 100644 --- a/testdata/source/amazon/D56-5204779-4181560.json +++ b/testdata/source/amazon/D56-5204779-4181560.json @@ -54,7 +54,7 @@ } ], "pretax_adjustments": [], - "tax": null, + "tax": [], "posttax_adjustments": [], "errors": [] } diff --git a/testdata/source/amazon/de_DE/256-0244967-2403944.json b/testdata/source/amazon/de_DE/256-0244967-2403944.json index eaaa047d..611b9e79 100644 --- a/testdata/source/amazon/de_DE/256-0244967-2403944.json +++ b/testdata/source/amazon/de_DE/256-0244967-2403944.json @@ -51,7 +51,7 @@ } } ], - "tax": null, + "tax": [], "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/393-2608279-9292916.json b/testdata/source/amazon/de_DE/393-2608279-9292916.json index 3872b61b..d5a1d966 100644 --- a/testdata/source/amazon/de_DE/393-2608279-9292916.json +++ b/testdata/source/amazon/de_DE/393-2608279-9292916.json @@ -65,7 +65,7 @@ } } ], - "tax": null, + "tax": [], "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/898-5185906-0096901.json b/testdata/source/amazon/de_DE/898-5185906-0096901.json index 26bd89f6..232d61ea 100644 --- a/testdata/source/amazon/de_DE/898-5185906-0096901.json +++ b/testdata/source/amazon/de_DE/898-5185906-0096901.json @@ -58,7 +58,7 @@ } } ], - "tax": null, + "tax": [], "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/974-6135682-9358749.json b/testdata/source/amazon/de_DE/974-6135682-9358749.json index 23e63ce1..190107f0 100644 --- a/testdata/source/amazon/de_DE/974-6135682-9358749.json +++ b/testdata/source/amazon/de_DE/974-6135682-9358749.json @@ -125,7 +125,7 @@ } } ], - "tax": null, + "tax": [], "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/D22-9220967-2566135.json b/testdata/source/amazon/de_DE/D22-9220967-2566135.json index 5c7a6da1..8c3ea1b4 100644 --- a/testdata/source/amazon/de_DE/D22-9220967-2566135.json +++ b/testdata/source/amazon/de_DE/D22-9220967-2566135.json @@ -29,7 +29,7 @@ ], "credit_card_transactions": [], "pretax_adjustments": [], - "tax": null, + "tax": [], "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/D60-9825125-4795642.json b/testdata/source/amazon/de_DE/D60-9825125-4795642.json index 82072efa..1b57f6c3 100644 --- a/testdata/source/amazon/de_DE/D60-9825125-4795642.json +++ b/testdata/source/amazon/de_DE/D60-9825125-4795642.json @@ -29,7 +29,7 @@ ], "credit_card_transactions": [], "pretax_adjustments": [], - "tax": null, + "tax": [], "posttax_adjustments": [], "errors": [] } \ No newline at end of file From 186483881b140e59070baf345615e6299db09624 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 19 Dec 2021 19:54:29 +0100 Subject: [PATCH 10/42] de_DE correct adjustments: posttax instead of pretax --- beancount_import/source/amazon_invoice.py | 7 +++---- testdata/source/amazon/de_DE/393-2608279-9292916.json | 7 ++++--- testdata/source/amazon/de_DE/898-5185906-0096901.json | 7 ++++--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index b34a4f8f..0e0d1492 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -169,15 +169,14 @@ def __init__(self): pretax_adjustment_fields_pattern=('(?:' + '|'.join([ 'Verpackung & Versand', # 'Free Shipping', 'Free delivery', 'Pantry delivery', - 'Gutschein eingelöst', # english version not removed yet - 'Geschenkgutschein\(e\)', # 'Promotion(?:s| Applied)', 'Lightning Deal', # 'Your Coupon Savings', '[0-9]+% off savings', # 'Subscribe & Save', '[0-9]+ Audible Credit Applied', # '.*[0-9]+% Off.*', 'Courtesy Credit', # 'Extra Savings', '(?:.*) Discount', 'Gift[ -]Wrap', ]) + ') *:'), - posttax_adjustment_fields_pattern=r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X', + # most adjustments in DE are posttax: + posttax_adjustment_fields_pattern='Gutschein eingelöst:|Geschenkgutschein\(e\):', # Payment Table & Credit Card Transactions grand_total=r'\n\s*(?:Gesamtsumme|Endsumme):\s+(.*)\n', # regular: Gesamtsumme, digital: Endsumme @@ -185,7 +184,7 @@ def __init__(self): credit_card_last_digits=r'^([^:]+) mit den Endziffern ([0-9]+):\s+([^:]+):$', payment_type=[ # only first matching regex is used! - r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Die letzten (?:4 )?Ziffern:\s+([0-9]{3,4})\n', # 3 digits for Bankeinzug + r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Die letzten (?:4 )?Ziffern:\s*([0-9]{3,4})', # 3 digits for Bankeinzug r'\n\s*(.+)\s+mit den Endziffern\s+([0-9]{4})\n' ], payment_information='^Zahlungsdaten$', diff --git a/testdata/source/amazon/de_DE/393-2608279-9292916.json b/testdata/source/amazon/de_DE/393-2608279-9292916.json index d5a1d966..91b59997 100644 --- a/testdata/source/amazon/de_DE/393-2608279-9292916.json +++ b/testdata/source/amazon/de_DE/393-2608279-9292916.json @@ -49,7 +49,10 @@ "number": "0.00", "currency": "EUR" } - }, + } + ], + "tax": [], + "posttax_adjustments": [ { "description": "Gutschein eingel\u00f6st", "amount": { @@ -65,7 +68,5 @@ } } ], - "tax": [], - "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/898-5185906-0096901.json b/testdata/source/amazon/de_DE/898-5185906-0096901.json index 232d61ea..da21a0fb 100644 --- a/testdata/source/amazon/de_DE/898-5185906-0096901.json +++ b/testdata/source/amazon/de_DE/898-5185906-0096901.json @@ -49,7 +49,10 @@ "number": "0.00", "currency": "EUR" } - }, + } + ], + "tax": [], + "posttax_adjustments": [ { "description": "Gutschein eingel\u00f6st", "amount": { @@ -58,7 +61,5 @@ } } ], - "tax": [], - "posttax_adjustments": [], "errors": [] } \ No newline at end of file From 17661bf72fdda81da7691017984fdfe620798ca7 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 19 Dec 2021 20:05:34 +0100 Subject: [PATCH 11/42] add ability to parse gift card orders correctly --- beancount_import/source/amazon_invoice.py | 231 ++++++++++++++++------ 1 file changed, 169 insertions(+), 62 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 0e0d1492..09e7b65c 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -55,6 +55,7 @@ class Locale_Data(): regular_estimated_tax: str regular_order_placed: str regular_order_id: str + gift_card: str # digital orders only digital_order: str @@ -130,6 +131,7 @@ def __init__(self) -> None: regular_estimated_tax = 'Estimated tax to be collected:', regular_order_placed=r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})', regular_order_id=r'.*Order ([0-9\-]+)', + gift_card='Gift Cards', # not confirmed yet! # digital orders only digital_order='Digital Order: (.*)', @@ -160,8 +162,8 @@ def __init__(self): tax_included_in_price=True, # no separate tax transactions # common fields regular and digital orders - items_ordered='Bestellte Artikel', - price='Preis', + items_ordered='Bestellte Artikel|Erhalten', + price='Preis|Betrag', currency='EUR', items_subtotal='Zwischensumme:', total_before_tax='Summe ohne MwSt.:', @@ -208,6 +210,7 @@ def __init__(self): regular_estimated_tax='Anzurechnende MwSt.:', regular_order_placed=r'(?:Getätigte Spar-Abo-Bestellung|Bestellung aufgegeben am):\s+(\d+\. [^\s]+ \d{4})', regular_order_id=r'.*Bestellung ([0-9\-]+)', + gift_card='Geschenkgutscheine', # digital orders only digital_order_cancelled='Order Canceled', @@ -382,6 +385,11 @@ def is_shipment_header_table(node): header_tables = soup.find_all(is_shipment_header_table) + if header_tables is []: + # e.g. if only gift cards in order + logger.debug('no shipment table found') + return [] + shipments = [] # type: List[Shipment] errors = [] # type: Errors @@ -406,8 +414,9 @@ def is_items_ordered_header(node): tds = node('td') if len(tds) < 2: return False - return (tds[0].text.strip() == locale.items_ordered and - tds[1].text.strip() == locale.price) + m1 = re.match(locale.items_ordered, tds[0].text.strip()) + m2 = re.match(locale.price, tds[1].text.strip()) + return(m1 is not None and m2 is not None) items_ordered_header = shipment_table.find(is_items_ordered_header) @@ -470,65 +479,162 @@ def is_items_ordered_header(node): price=price, )) - logger.debug('parsing shipment amounts...') - items_subtotal = locale.parse_amount( - get_field_in_table(shipment_table, locale.items_subtotal)) - - expected_items_subtotal = reduce_amounts( - beancount.core.amount.mul(x.price, D(x.quantity)) for x in items) - if (items_subtotal is not None and - expected_items_subtotal != items_subtotal): - errors.append( - 'expected items subtotal is %r, but parsed value is %r' % - (expected_items_subtotal, items_subtotal)) - - output_fields = dict() - output_fields['pretax_adjustments'] = get_adjustments_in_table( - shipment_table, locale.pretax_adjustment_fields_pattern, locale=locale) - output_fields['posttax_adjustments'] = get_adjustments_in_table( - shipment_table, locale.posttax_adjustment_fields_pattern, locale=locale) - pretax_parts = [items_subtotal or expected_items_subtotal] + [ - a.amount for a in output_fields['pretax_adjustments'] - ] - total_before_tax = locale.parse_amount( - get_field_in_table(shipment_table, locale.total_before_tax)) - expected_total_before_tax = reduce_amounts(pretax_parts) - if total_before_tax is None: - total_before_tax = expected_total_before_tax - elif expected_total_before_tax != total_before_tax: - errors.append( - 'expected total before tax is %s, but parsed value is %s' % - (expected_total_before_tax, total_before_tax)) - - sales_tax = get_adjustments_in_table(shipment_table, locale.shipment_sales_tax, locale=locale) - - posttax_parts = ( - [total_before_tax] + [a.amount for a in sales_tax] + - [a.amount for a in output_fields['posttax_adjustments']]) - total = locale.parse_amount( - get_field_in_table(shipment_table, locale.shipment_total)) - expected_total = reduce_amounts(posttax_parts) - if total is None: - total = expected_total - elif expected_total != total: - errors.append('expected total is %s, but parsed value is %s' % - (expected_total, total)) - - logger.debug('...finshed parsing shipment') - shipments.append( - Shipment( - shipped_date=shipped_date, - items=items, - items_subtotal=items_subtotal, - total_before_tax=total_before_tax, - tax=sales_tax, - total=total, - errors=errors, - **output_fields)) + shipments.append(parse_shipment_payments( + shipment_table, + items, + errors, + shipped_date=shipped_date, + locale=locale + )) + + return shipments + +def parse_gift_cards(soup, locale=Locale_en_EN()) -> List[Shipment]: + """ + Parses Gift Card Table Part of HTML document (1st Table) + """ + def is_gift_card_header_table(node): + if node.name != 'table': + return False + text = node.text.strip() + m = re.match(locale.gift_card, text) + if m is not None: + # check if a matching subtable exists + sub_table = node.find_all(is_gift_card_header_table) + if sub_table == []: + # only match if it is the innermost table + return True + return False + + header_tables = soup.find_all(is_gift_card_header_table) + + if header_tables is []: + # if no gift cards in order + logger.debug('no shipment table found') + return [] + + shipments = [] # type: List[Shipment] + errors = [] # type: Errors + + for header_table in header_tables: + + items = [] + + shipment_table = header_table.find_parent('table') + + logger.debug('parsing gift card items...') + def is_items_ordered_header(node): + if node.name != 'tr': + return False + tds = node('td') + if len(tds) < 2: + return False + m1 = re.match(locale.items_ordered, tds[0].text.strip()) + m2 = re.match(locale.price, tds[1].text.strip()) + return(m1 is not None and m2 is not None) + + items_ordered_header = shipment_table.find(is_items_ordered_header) + + item_rows = [items_ordered_header] + + for item_row in item_rows: + tds = item_row('td') + description_node = tds[0] + price_node = tds[1] + price = price_node.text.strip() + price = price.split('\n')[1] + + if price is None: + price = Amount(D(0), locale.currency) + else: + price = locale.parse_amount(price) + + m = re.search(r'^(?PGeschenkgutschein)[\w\s-]*:\s*(?P[\w@._-]*)$', description_node.text.strip(), re.MULTILINE|re.UNICODE) + + description = m.group('type').strip() + ' ' + m.group('sent_to').strip() + + items.append( + Item( + quantity=D(1), + description=description, + sold_by=None, + condition=None, + price=price, + )) + + shipments.append(parse_shipment_payments( + shipment_table, + items, + errors, + shipped_date=None, + locale=locale + )) return shipments +def parse_shipment_payments( + shipment_table, items, errors, + shipped_date=None, locale=Locale_en_EN()): + """ Parse payment information of single shipments and gift card orders. + """ + logger.debug('parsing shipment amounts...') + items_subtotal = locale.parse_amount( + get_field_in_table(shipment_table, locale.items_subtotal)) + + expected_items_subtotal = reduce_amounts( + beancount.core.amount.mul(x.price, D(x.quantity)) for x in items) + if (items_subtotal is not None and + expected_items_subtotal != items_subtotal): + errors.append( + 'expected items subtotal is %r, but parsed value is %r' % + (expected_items_subtotal, items_subtotal)) + + output_fields = dict() + output_fields['pretax_adjustments'] = get_adjustments_in_table( + shipment_table, locale.pretax_adjustment_fields_pattern, locale=locale) + print(output_fields['pretax_adjustments']) + output_fields['posttax_adjustments'] = get_adjustments_in_table( + shipment_table, locale.posttax_adjustment_fields_pattern, locale=locale) + pretax_parts = [items_subtotal or expected_items_subtotal] + [ + a.amount for a in output_fields['pretax_adjustments'] + ] + total_before_tax = locale.parse_amount( + get_field_in_table(shipment_table, locale.total_before_tax)) + expected_total_before_tax = reduce_amounts(pretax_parts) + if total_before_tax is None: + total_before_tax = expected_total_before_tax + elif expected_total_before_tax != total_before_tax: + errors.append( + 'expected total before tax is %s, but parsed value is %s' % + (expected_total_before_tax, total_before_tax)) + + sales_tax = get_adjustments_in_table(shipment_table, locale.shipment_sales_tax, locale=locale) + + posttax_parts = ( + [total_before_tax] + [a.amount for a in sales_tax] + + [a.amount for a in output_fields['posttax_adjustments']]) + total = locale.parse_amount( + get_field_in_table(shipment_table, locale.shipment_total)) + expected_total = reduce_amounts(posttax_parts) + if total is None: + total = expected_total + elif expected_total != total: + errors.append('expected total is %s, but parsed value is %s' % + (expected_total, total)) + + logger.debug('...finshed parsing shipment') + return Shipment( + shipped_date=shipped_date, + items=items, + items_subtotal=items_subtotal, + total_before_tax=total_before_tax, + tax=sales_tax, + total=total, + errors=errors, + **output_fields) + + def parse_credit_card_transactions_from_payments_table( payment_table, order_date: datetime.date, @@ -610,7 +716,7 @@ def parse_regular_order_invoice(path: str, locale=Locale_en_EN()) -> Order: with open(path, 'rb') as f: soup = bs4.BeautifulSoup(f.read(), 'lxml') logger.debug('parsing shipments...') - shipments = parse_shipments(soup, locale=locale) + shipments = parse_shipments(soup, locale=locale) + parse_gift_cards(soup, locale=locale) logger.debug('finished parsing shipments') logger.debug('parsing payment table...') payment_table_header = soup.find( @@ -816,8 +922,9 @@ def is_items_ordered_header(node): tds = node('td') if len(tds) < 2: return False - return (tds[0].text.strip() == locale.items_ordered and - tds[1].text.strip() == locale.price) + m1 = re.match(locale.items_ordered, tds[0].text.strip()) + m2 = re.match(locale.price, tds[1].text.strip()) + return(m1 is not None and m2 is not None) items_ordered_header = digital_order_table.find(is_items_ordered_header) From 7afeaac5aa66ee978d3d8e8b8b905329fb3dad19 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 19 Dec 2021 20:06:59 +0100 Subject: [PATCH 12/42] remove debugging logs --- beancount_import/source/amazon_invoice.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 0e0d1492..c50db7a4 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -913,8 +913,6 @@ def get_amounts_in_text(pattern_map): locale.pretax_adjustment_fields_pattern) pretax_parts = ([items_subtotal] + [a.amount for a in output_fields['pretax_adjustments']]) - logger.debug(pretax_parts) - logger.debug(total_before_tax) expected_total_before_tax = reduce_amounts(pretax_parts) if expected_total_before_tax != total_before_tax: errors.append('expected total before tax is %s, but parsed value is %s' @@ -926,8 +924,7 @@ def get_amounts_in_text(pattern_map): posttax_parts = ([total_before_tax] + [a.amount for a in tax] + [a.amount for a in output_fields['posttax_adjustments']]) expected_total = reduce_amounts(posttax_parts) - - logger.debug(total_for_this_order) + if expected_total != total_for_this_order: errors.append('expected total is %s, but parsed value is %s' % (expected_total, total_for_this_order)) From 0bbce162a52763de6554e528d981bebcf3a2dc71 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 19 Dec 2021 20:34:02 +0100 Subject: [PATCH 13/42] add some test data --- beancount_import/source/amazon_invoice.py | 2 +- .../source/amazon_invoice_test.py | 2 + .../amazon/de_DE/071-4816388-0694813.html | 227 ++++++++++++++++++ .../amazon/de_DE/071-4816388-0694813.json | 49 ++++ .../amazon/de_DE/075-2225405-7594823.html | 227 ++++++++++++++++++ .../amazon/de_DE/075-2225405-7594823.json | 49 ++++ 6 files changed, 555 insertions(+), 1 deletion(-) create mode 100644 testdata/source/amazon/de_DE/071-4816388-0694813.html create mode 100644 testdata/source/amazon/de_DE/071-4816388-0694813.json create mode 100644 testdata/source/amazon/de_DE/075-2225405-7594823.html create mode 100644 testdata/source/amazon/de_DE/075-2225405-7594823.json diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 73dbf640..62f988a4 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -162,7 +162,7 @@ def __init__(self): tax_included_in_price=True, # no separate tax transactions # common fields regular and digital orders - items_ordered='Bestellte Artikel|Erhalten', + items_ordered='Bestellte Artikel|Erhalten|Versendet', # Erhalten|Versendet for gift cards price='Preis|Betrag', currency='EUR', items_subtotal='Zwischensumme:', diff --git a/beancount_import/source/amazon_invoice_test.py b/beancount_import/source/amazon_invoice_test.py index 98326b82..41a45a41 100644 --- a/beancount_import/source/amazon_invoice_test.py +++ b/beancount_import/source/amazon_invoice_test.py @@ -38,6 +38,8 @@ def test_parsing_en_EN(name: str): '974-6135682-9358749', 'D22-9220967-2566135', 'D60-9825125-4795642' + '071-4816388-0694813', # gift card amazon + '075-2225405-7594823', # gift card spotify ]) def test_parsing_de_DE(name: str): testdata_dir_locale = os.path.join(testdata_dir, 'de_DE') diff --git a/testdata/source/amazon/de_DE/071-4816388-0694813.html b/testdata/source/amazon/de_DE/071-4816388-0694813.html new file mode 100644 index 00000000..0f3b8b6a --- /dev/null +++ b/testdata/source/amazon/de_DE/071-4816388-0694813.html @@ -0,0 +1,227 @@ + + + + + + +Amazon.de - Bestellung 071-4816388-0694813 + + + + +
    +
    +
    + Übersicht zur Bestellung #071-4816388-0694813 +
    +Bitte drucken Sie diese Seite aus und legen Sie sie zu Ihren Unterlagen. +

    + + + + +
    + + + + + + +
    + + Bestellung aufgegeben am: + + 12. August 2020 +
    +Bestellnummer: + 071-4816388-0694813 + +Bestellübersicht drucken | + Rechnung drucken +
    +Gesamtbestellwert: + EUR 50,00 +
    +
    + + + + +
    + + + + + + + +
    + + + + +
    +
    Geschenkgutscheine
    +
    +
    + + + + +
    + + + + +
    +   +
    + + + + + +
    + +Erhalten
    +Geschenkgutschein per E-Mail schicken an: johndoe@mail.com
    + - Von: removed
    + - Nachricht: +
    greetings message removed

    +
    +Betrag
    +EUR 50,00
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    + + + + + + + +
    + + + + +
    +
    Zahlungsdaten
    +
    +
    + + + + +
    + + + + +
    + + + + + + + + + + + + + + + + + + + + + +
    Zwischensumme:EUR 50,00
     -----
    Summe:EUR 50,00
     -----
    Gesamtsumme: EUR 50,00
    +
    +Zahlungsart: +
    + + + + + + + +Visa / Electron + | Die letzten Ziffern:1234
    +
    +
    +
    +
    +
    +

    Um den Status Ihrer Bestellung einzusehen, kehren Sie auf Bestellungsübersicht zurück.

    +

    Hinweis: Dies ist keine Rechnung.

    +
    + +
    + + + +
    + +Unsere AGB |  + + + + + + +Datenschutzerklärung |  + + + + + + +Impressum  © 6143-7307, Amazon.com, Inc. und Tochtergesellschaften +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/071-4816388-0694813.json b/testdata/source/amazon/de_DE/071-4816388-0694813.json new file mode 100644 index 00000000..48263f5b --- /dev/null +++ b/testdata/source/amazon/de_DE/071-4816388-0694813.json @@ -0,0 +1,49 @@ +{ + "order_id": "071-4816388-0694813", + "order_date": "2020-08-12", + "shipments": [ + { + "shipped_date": null, + "items": [ + { + "quantity": "1", + "description": "Geschenkgutschein johndoe@mail.com", + "sold_by": null, + "condition": null, + "price": { + "number": "50.00", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": { + "number": "50.00", + "currency": "EUR" + }, + "posttax_adjustments": [], + "tax": [], + "total": { + "number": "50.00", + "currency": "EUR" + }, + "errors": [] + } + ], + "credit_card_transactions": [ + { + "date": "2020-08-12", + "card_description": "Visa / Electron", + "card_ending_in": "1234", + "amount": { + "number": "50.00", + "currency": "EUR" + } + } + ], + "pretax_adjustments": [], + "tax": [], + "posttax_adjustments": [], + "errors": [] +} \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/075-2225405-7594823.html b/testdata/source/amazon/de_DE/075-2225405-7594823.html new file mode 100644 index 00000000..6013ef0a --- /dev/null +++ b/testdata/source/amazon/de_DE/075-2225405-7594823.html @@ -0,0 +1,227 @@ + + + + + + +Amazon.de - Bestellung 075-2225405-7594823 + + + + +
    +
    +
    + Übersicht zur Bestellung #075-2225405-7594823 +
    +Bitte drucken Sie diese Seite aus und legen Sie sie zu Ihren Unterlagen. +

    + + + + +
    + + + + + + +
    + + Bestellung aufgegeben am: + + 1. März 2020 +
    +Bestellnummer: + 075-2225405-7594823 + +Bestellübersicht drucken | + Rechnung drucken +
    +Gesamtbestellwert: + EUR 99,00 +
    +
    + + + + +
    + + + + + + + +
    + + + + +
    +
    Geschenkgutscheine
    +
    +
    + + + + +
    + + + + +
    +   +
    + + + + + +
    + +Versendet
    +Geschenkgutschein per E-Mail schicken an: johndoe@mail.com
    + - Von: removed
    + - Nachricht: +
    greetings message removed

    +
    +Betrag
    +EUR 99,00
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    + + + + + + + +
    + + + + +
    +
    Zahlungsdaten
    +
    +
    + + + + +
    + + + + +
    + + + + + + + + + + + + + + + + + + + + + +
    Zwischensumme:EUR 99,00
     -----
    Summe:EUR 99,00
     -----
    Gesamtsumme: EUR 99,00
    +
    +Zahlungsart: +
    + + + + + + + +Visa / Electron + | Die letzten Ziffern:1234
    +
    +
    +
    +
    +
    +

    Um den Status Ihrer Bestellung einzusehen, kehren Sie auf Bestellungsübersicht zurück.

    +

    Hinweis: Dies ist keine Rechnung.

    +
    + +
    + + + +
    + +Unsere AGB |  + + + + + + +Datenschutzerklärung |  + + + + + + +Impressum  © 2252-1593, Amazon.com, Inc. und Tochtergesellschaften +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/075-2225405-7594823.json b/testdata/source/amazon/de_DE/075-2225405-7594823.json new file mode 100644 index 00000000..4fa08e1b --- /dev/null +++ b/testdata/source/amazon/de_DE/075-2225405-7594823.json @@ -0,0 +1,49 @@ +{ + "order_id": "075-2225405-7594823", + "order_date": "2020-03-01", + "shipments": [ + { + "shipped_date": null, + "items": [ + { + "quantity": "1", + "description": "Geschenkgutschein johndoe@mail.com", + "sold_by": null, + "condition": null, + "price": { + "number": "99.00", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": { + "number": "99.00", + "currency": "EUR" + }, + "posttax_adjustments": [], + "tax": [], + "total": { + "number": "99.00", + "currency": "EUR" + }, + "errors": [] + } + ], + "credit_card_transactions": [ + { + "date": "2020-03-01", + "card_description": "Visa / Electron", + "card_ending_in": "1234", + "amount": { + "number": "99.00", + "currency": "EUR" + } + } + ], + "pretax_adjustments": [], + "tax": [], + "posttax_adjustments": [], + "errors": [] +} \ No newline at end of file From a56215995251b48f3bbba7ad28cd09eeb4beffab Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 19 Dec 2021 20:36:06 +0100 Subject: [PATCH 14/42] add test for direct debit --- .../source/amazon_invoice_test.py | 13 +- .../amazon/de_DE/399-5779972-5007935.html | 316 ++++++++++++++++++ .../amazon/de_DE/399-5779972-5007935.json | 57 ++++ 3 files changed, 380 insertions(+), 6 deletions(-) create mode 100644 testdata/source/amazon/de_DE/399-5779972-5007935.html create mode 100644 testdata/source/amazon/de_DE/399-5779972-5007935.json diff --git a/beancount_import/source/amazon_invoice_test.py b/beancount_import/source/amazon_invoice_test.py index 98326b82..1dfc3166 100644 --- a/beancount_import/source/amazon_invoice_test.py +++ b/beancount_import/source/amazon_invoice_test.py @@ -32,12 +32,13 @@ def test_parsing_en_EN(name: str): @pytest.mark.parametrize('name', [ - '256-0244967-2403944', - '393-2608279-9292916', - '898-5185906-0096901', - '974-6135682-9358749', - 'D22-9220967-2566135', - 'D60-9825125-4795642' + '256-0244967-2403944', # regular order + '393-2608279-9292916', # Spar-Abo, payed with gift card + '898-5185906-0096901', # Spar-Abo + '974-6135682-9358749', # several credit card transactions + 'D22-9220967-2566135', # digital order, audible subscription + 'D60-9825125-4795642', # digital order + '399-5779972-5007935', # Direct Debit (Bankeinzug) ]) def test_parsing_de_DE(name: str): testdata_dir_locale = os.path.join(testdata_dir, 'de_DE') diff --git a/testdata/source/amazon/de_DE/399-5779972-5007935.html b/testdata/source/amazon/de_DE/399-5779972-5007935.html new file mode 100644 index 00000000..c1c72f9a --- /dev/null +++ b/testdata/source/amazon/de_DE/399-5779972-5007935.html @@ -0,0 +1,316 @@ + + + + + + +Amazon.de - Bestellung 399-5779972-5007935 + + + + +
    +
    +
    + Übersicht zur Bestellung #399-5779972-5007935 +
    +Bitte drucken Sie diese Seite aus und legen Sie sie zu Ihren Unterlagen. +

    + + + + +
    + + + + +
    + + Bestellung aufgegeben am: + + 21. November 2021 +
    +Bestellnummer: + 399-5779972-5007935 +
    +Gesamtbestellwert: + EUR 16,99 +
    +
    + + + + +
    + + + + + + + + + + +
    + + + + +
    +
    + versandt am 22. November 2021 +
    +
    +
    + + + + +
    + + + + +
    +   +
    + + + + + + + + + +
    +Bestellte Artikel + +Preis +
    + + 1 + + Exemplar(e) von: + + tiptoi® Mein großes Wimmelbuch, Friese, Inka
    + + Verkauf durch: Amazon EU S.a.r.L. + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + Zustand: Neu
    +
    +
    +EUR 16,99
    +
    +
    +
    +
    + + + + + +
    + +Versandadresse + +
    + + + + + + + + + +
    +Versandart: + +
    +Premiumversand +
    +
    + +
    +
    +
    +
    +
    + + + + +
    + + + + + + + +
    + + + + +
    +
    Zahlungsdaten
    +
    +
    + + + + +
    + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Zwischensumme:EUR 15,88
    Verpackung & Versand:EUR 0,00
     -----
    Summe ohne MwSt.:EUR 15,88
    Anzurechnende MwSt.:EUR 1,11
     -----
    Summe:EUR 16,99
     -----
    Gesamtsumme: EUR 16,99
    +
    +Zahlungsart: +
    + + + Bankeinzug + | Die letzten Ziffern: 600
    +
    +Rechnungsadresse: + + + + + + + + + +
    +
    +
    +
    +
    +

    Um den Status Ihrer Bestellung einzusehen, kehren Sie auf Bestellungsübersicht zurück.

    +

    Hinweis: Dies ist keine Rechnung.

    +
    + +
    + + + +
    + +Unsere AGB |  + + + + + + +Datenschutzerklärung |  + + + + + + +Impressum  © 6622-9426, Amazon.com, Inc. und Tochtergesellschaften +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/399-5779972-5007935.json b/testdata/source/amazon/de_DE/399-5779972-5007935.json new file mode 100644 index 00000000..4efacf86 --- /dev/null +++ b/testdata/source/amazon/de_DE/399-5779972-5007935.json @@ -0,0 +1,57 @@ +{ + "order_id": "399-5779972-5007935", + "order_date": "2021-11-21", + "shipments": [ + { + "shipped_date": "2021-11-22", + "items": [ + { + "quantity": "1", + "description": "tiptoi\u00ae Mein gro\u00dfes Wimmelbuch, Friese, Inka", + "sold_by": "Amazon EU S.a.r.L.", + "condition": "Neu", + "price": { + "number": "16.99", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": { + "number": "16.99", + "currency": "EUR" + }, + "posttax_adjustments": [], + "tax": [], + "total": { + "number": "16.99", + "currency": "EUR" + }, + "errors": [] + } + ], + "credit_card_transactions": [ + { + "date": "2021-11-21", + "card_description": "Bankeinzug", + "card_ending_in": "600", + "amount": { + "number": "16.99", + "currency": "EUR" + } + } + ], + "pretax_adjustments": [ + { + "description": "Verpackung & Versand", + "amount": { + "number": "0.00", + "currency": "EUR" + } + } + ], + "tax": [], + "posttax_adjustments": [], + "errors": [] +} \ No newline at end of file From 3d4a07700ac7fc9836f852f67d2c584134c8e3b1 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 19 Dec 2021 21:12:37 +0100 Subject: [PATCH 15/42] add amazon account charge up including test case --- beancount_import/source/amazon_invoice.py | 16 +- .../source/amazon_invoice_test.py | 1 + .../amazon/de_DE/447-6209054-6766419.html | 221 ++++++++++++++++++ .../amazon/de_DE/447-6209054-6766419.json | 49 ++++ 4 files changed, 284 insertions(+), 3 deletions(-) create mode 100644 testdata/source/amazon/de_DE/447-6209054-6766419.html create mode 100644 testdata/source/amazon/de_DE/447-6209054-6766419.json diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 62f988a4..74aeb817 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -56,6 +56,8 @@ class Locale_Data(): regular_order_placed: str regular_order_id: str gift_card: str + gift_card_to: str + gift_card_amazon_account: str # digital orders only digital_order: str @@ -132,6 +134,8 @@ def __init__(self) -> None: regular_order_placed=r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})', regular_order_id=r'.*Order ([0-9\-]+)', gift_card='Gift Cards', # not confirmed yet! + gift_card_to=r'^(?PGift Card)[\w\s-]*:\s*(?P[\w@._-]*)$', # guess, not confirmed yet! + gift_card_amazon_account=r'^[\w\s-]*(?PAmazon-Account)[\w\s-]*(?Pcharged up)[\w\s-]*$', # guess, not confirmed yet! # digital orders only digital_order='Digital Order: (.*)', @@ -162,7 +166,7 @@ def __init__(self): tax_included_in_price=True, # no separate tax transactions # common fields regular and digital orders - items_ordered='Bestellte Artikel|Erhalten|Versendet', # Erhalten|Versendet for gift cards + items_ordered='Bestellte Artikel|Erhalten|Versendet|Amazon-Konto erfolgreich aufgeladen', # Erhalten|Versendet for gift cards price='Preis|Betrag', currency='EUR', items_subtotal='Zwischensumme:', @@ -211,6 +215,8 @@ def __init__(self): regular_order_placed=r'(?:Getätigte Spar-Abo-Bestellung|Bestellung aufgegeben am):\s+(\d+\. [^\s]+ \d{4})', regular_order_id=r'.*Bestellung ([0-9\-]+)', gift_card='Geschenkgutscheine', + gift_card_to=r'^(?PGeschenkgutschein)[\w\s-]*:\s*(?P[\w@._-]*)$', + gift_card_amazon_account=r'^[\w\s-]*(?PAmazon-Konto)[\w\s-]*(?Paufgeladen)[\w\s-]*$', # digital orders only digital_order_cancelled='Order Canceled', @@ -549,8 +555,12 @@ def is_items_ordered_header(node): else: price = locale.parse_amount(price) - m = re.search(r'^(?PGeschenkgutschein)[\w\s-]*:\s*(?P[\w@._-]*)$', description_node.text.strip(), re.MULTILINE|re.UNICODE) - + m = re.search(locale.gift_card_to, description_node.text.strip(), re.MULTILINE|re.UNICODE) + print(m) + if m is None: + # check if Amazon account has been charged up + m = re.search(locale.gift_card_amazon_account, description_node.text.strip(), re.MULTILINE|re.UNICODE) + print(m) description = m.group('type').strip() + ' ' + m.group('sent_to').strip() items.append( diff --git a/beancount_import/source/amazon_invoice_test.py b/beancount_import/source/amazon_invoice_test.py index f55f7adf..3788e5b6 100644 --- a/beancount_import/source/amazon_invoice_test.py +++ b/beancount_import/source/amazon_invoice_test.py @@ -41,6 +41,7 @@ def test_parsing_en_EN(name: str): '399-5779972-5007935', # Direct Debit (Bankeinzug) '071-4816388-0694813', # gift card amazon '075-2225405-7594823', # gift card spotify + '447-6209054-6766419', # charge up Amazon account ]) def test_parsing_de_DE(name: str): testdata_dir_locale = os.path.join(testdata_dir, 'de_DE') diff --git a/testdata/source/amazon/de_DE/447-6209054-6766419.html b/testdata/source/amazon/de_DE/447-6209054-6766419.html new file mode 100644 index 00000000..f6186461 --- /dev/null +++ b/testdata/source/amazon/de_DE/447-6209054-6766419.html @@ -0,0 +1,221 @@ + + + + + + +Amazon.de - Bestellung 447-6209054-6766419 + + + + +
    +
    +
    + Übersicht zur Bestellung #447-6209054-6766419 +
    +Bitte drucken Sie diese Seite aus und legen Sie sie zu Ihren Unterlagen. +

    + + + + +
    + + + + + + +
    + + Bestellung aufgegeben am: + + 27. Juli 2017 +
    +Bestellnummer: + 447-6209054-6766419 + +Bestellübersicht drucken | + Rechnung drucken +
    +Gesamtbestellwert: + EUR 100,00 +
    +
    + + + + +
    + + + + + + + +
    + + + + +
    +
    Geschenkgutscheine
    +
    +
    + + + + +
    + + + + +
    +   +
    + + + + + +
    +Amazon-Konto erfolgreich aufgeladen
    +
    Ihr Amazon-Konto wurde erfolgreich aufgeladen. Ihr Saldo enthält nun das zusätzliche Guthaben.
    +
    +Betrag
    +EUR 100,00
    +
    +
    +
    +
    +
    + + + + +
    + + + + + + + +
    + + + + +
    +
    Zahlungsdaten
    +
    +
    + + + + +
    + + + + +
    + + + + + + + + + + + + + + + + + + + + + +
    Zwischensumme:EUR 100,00
     -----
    Summe:EUR 100,00
     -----
    Gesamtsumme: EUR 100,00
    +
    +Zahlungsart: +
    + + + + + + + +Visa / Electron + | Die letzten Ziffern:1234
    +
    +
    +
    +
    +
    +

    Um den Status Ihrer Bestellung einzusehen, kehren Sie auf Bestellungsübersicht zurück.

    +

    Hinweis: Dies ist keine Rechnung.

    +
    + +
    + + + +
    + +Unsere AGB |  + + + + + + +Datenschutzerklärung |  + + + + + + +Impressum  © 5286-9368, Amazon.com, Inc. und Tochtergesellschaften +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/447-6209054-6766419.json b/testdata/source/amazon/de_DE/447-6209054-6766419.json new file mode 100644 index 00000000..289c89b9 --- /dev/null +++ b/testdata/source/amazon/de_DE/447-6209054-6766419.json @@ -0,0 +1,49 @@ +{ + "order_id": "447-6209054-6766419", + "order_date": "2017-07-27", + "shipments": [ + { + "shipped_date": null, + "items": [ + { + "quantity": "1", + "description": "Amazon-Konto aufgeladen", + "sold_by": null, + "condition": null, + "price": { + "number": "100.00", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": { + "number": "100.00", + "currency": "EUR" + }, + "posttax_adjustments": [], + "tax": [], + "total": { + "number": "100.00", + "currency": "EUR" + }, + "errors": [] + } + ], + "credit_card_transactions": [ + { + "date": "2017-07-27", + "card_description": "Visa / Electron", + "card_ending_in": "1234", + "amount": { + "number": "100.00", + "currency": "EUR" + } + } + ], + "pretax_adjustments": [], + "tax": [], + "posttax_adjustments": [], + "errors": [] +} \ No newline at end of file From d3e701be3e4b9075e7bb983efd4d94063a8985e5 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 19 Dec 2021 21:13:53 +0100 Subject: [PATCH 16/42] remove debugging print statement --- beancount_import/source/amazon_invoice.py | 1 - 1 file changed, 1 deletion(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 74aeb817..b99eea7a 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -603,7 +603,6 @@ def parse_shipment_payments( output_fields = dict() output_fields['pretax_adjustments'] = get_adjustments_in_table( shipment_table, locale.pretax_adjustment_fields_pattern, locale=locale) - print(output_fields['pretax_adjustments']) output_fields['posttax_adjustments'] = get_adjustments_in_table( shipment_table, locale.posttax_adjustment_fields_pattern, locale=locale) pretax_parts = [items_subtotal or expected_items_subtotal] + [ From a641c9400a8f1514fb7eb1c7ec8a47b8874e3291 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Thu, 13 Jan 2022 14:28:00 +0100 Subject: [PATCH 17/42] make quantity parsing more consistent, add log message if parsing failed --- beancount_import/source/amazon_invoice.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index c50db7a4..fc794edb 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -429,19 +429,18 @@ def is_items_ordered_header(node): # 2.07 lb of: Pork Sausage Link Italian Mild Step 1 m = re.match(locale.shipment_quantity, description_node.text, re.UNICODE|re.DOTALL) - quantity = 1 + if m is not None: # Amazon will say you got, e.g. 2 broccoli crowns at $1.69/lb - but then this code multiplies the 2 by the price listed # on the invoice, which is the total price in this case (but the per-unit price in other cases) - so if there's a quantity # and a weight, ignore the quantity and treat it as 1 # alternately, capture the weight and the per-unit price and multiply out quantity = m.group("quantity") # ignore quantity for weight items - - if quantity is None: - #print("Unable to extract quantity, using 1: %s" % description_node.text) - quantity = D(1) else: - quantity = D(quantity) + quantity = 1 + logger.info("Unable to extract quantity, using 1: %s" % description_node.text) + + quantity = D(quantity) text = description_node.text.split(locale.shipment_of, 1)[1] From a460017c4ac7183d488c7077671d7e49eed6d539 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Thu, 13 Jan 2022 14:28:33 +0100 Subject: [PATCH 18/42] fix DE shipment_quantity_pattern --- beancount_import/source/amazon_invoice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index fc794edb..7e1290fd 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -197,7 +197,7 @@ def __init__(self): 'Not Yet Shipped', 'Shipping now' }, - shipment_quantity=r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+of:', + shipment_quantity=r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+Exemplar\(e\)\svon:', shipment_of='Exemplar(e) von:', shipment_sales_tax='Anzurechnende MwSt.:', # not sure (only old invoices) shipment_total='Gesamtsumme:', From d367cdbc6b1c3bd811482a9150a0bf915afcef8e Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Thu, 13 Jan 2022 22:05:13 +0100 Subject: [PATCH 19/42] fix shipment quantity algorithm --- beancount_import/source/amazon_invoice.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 7e1290fd..b5e16400 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -430,13 +430,15 @@ def is_items_ordered_header(node): m = re.match(locale.shipment_quantity, description_node.text, re.UNICODE|re.DOTALL) + quantity = None if m is not None: # Amazon will say you got, e.g. 2 broccoli crowns at $1.69/lb - but then this code multiplies the 2 by the price listed # on the invoice, which is the total price in this case (but the per-unit price in other cases) - so if there's a quantity # and a weight, ignore the quantity and treat it as 1 # alternately, capture the weight and the per-unit price and multiply out quantity = m.group("quantity") # ignore quantity for weight items - else: + + if quantity is None: quantity = 1 logger.info("Unable to extract quantity, using 1: %s" % description_node.text) From eda2adb9d3364b3a1295f8d1439ea3f5d3cb48ba Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 20 Feb 2022 14:20:17 +0100 Subject: [PATCH 20/42] make payee match chosen locale --- beancount_import/source/amazon.py | 6 ++++-- beancount_import/source/amazon_invoice.py | 3 +++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/beancount_import/source/amazon.py b/beancount_import/source/amazon.py index 35ec6c99..85927f52 100644 --- a/beancount_import/source/amazon.py +++ b/beancount_import/source/amazon.py @@ -307,6 +307,7 @@ def make_amazon_transaction( posttax_adjustment_accounts, credit_card_accounts, amazon_account: str, + payee='Amazon.com' ): txn = Transaction( date=invoice.order_date, @@ -314,7 +315,7 @@ def make_amazon_transaction( (ORDER_ID_KEY, invoice.order_id), (AMAZON_ACCOUNT_KEY, amazon_account), ]), - payee='Amazon.com', + payee=payee, narration='Order', flag=FLAG_OKAY, tags=EMPTY_SET, @@ -614,7 +615,8 @@ def prepare(self, journal: JournalEditor, results: SourceResults): invoice=invoice, posttax_adjustment_accounts=self.posttax_adjustment_accounts, amazon_account=self.amazon_account, - credit_card_accounts=credit_card_accounts) + credit_card_accounts=credit_card_accounts, + payee=self.locale.payee) results.add_pending_entry( ImportResult( date=transaction.date, diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index b5e16400..dbfcfcb9 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -24,6 +24,7 @@ class Locale_Data(): LOCALE: str tax_included_in_price: bool + payee: str # common fields regular and digital orders items_ordered: str @@ -74,6 +75,7 @@ def __init__(self) -> None: super().__init__( LOCALE='en_EN', tax_included_in_price=False, + payee='Amazon.com', # common fields regular and digital orders items_ordered='Items Ordered', # shipment + digital @@ -158,6 +160,7 @@ def __init__(self): super().__init__( LOCALE='de_DE', tax_included_in_price=True, # no separate tax transactions + payee='Amazon.de', # common fields regular and digital orders items_ordered='Bestellte Artikel', From 98f15f07033ea94614bb058e05051f03faccd4e3 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 20 Feb 2022 14:23:23 +0100 Subject: [PATCH 21/42] add nonshipped header translation --- beancount_import/source/amazon_invoice.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index dbfcfcb9..239af448 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -194,9 +194,10 @@ def __init__(self): # regular orders only shipment_shipped_pattern='^versandt am ([^\\n]+)$', - shipment_nonshipped_headers={ # Translations missing + shipment_nonshipped_headers={ + 'Versand wird vorbereitet', + # Translations missing 'Service completed', - 'Preparing for Shipment', 'Not Yet Shipped', 'Shipping now' }, From df350d2c155414bdb5352e3e59a91ecc173096d6 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Mon, 21 Feb 2022 08:40:27 +0100 Subject: [PATCH 22/42] add nonshipped header translation --- beancount_import/source/amazon_invoice.py | 1 + 1 file changed, 1 insertion(+) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 239af448..365ecca9 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -196,6 +196,7 @@ def __init__(self): shipment_shipped_pattern='^versandt am ([^\\n]+)$', shipment_nonshipped_headers={ 'Versand wird vorbereitet', + 'Versand in Kürze', # Translations missing 'Service completed', 'Not Yet Shipped', From c179a4881d0fc8d3f804ed2c2475755a0d5e59a7 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Thu, 24 Mar 2022 16:45:39 +0100 Subject: [PATCH 23/42] fix typing, remove debug print statements --- beancount_import/source/amazon_invoice.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index b99eea7a..7a642705 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -269,7 +269,7 @@ def parse_date(date_str) -> datetime.date: Item = NamedTuple('Item', [ ('quantity', Decimal), ('description', str), - ('sold_by', str), + ('sold_by', Optional[str]), ('condition', Optional[str]), ('price', Amount), ]) @@ -556,12 +556,15 @@ def is_items_ordered_header(node): price = locale.parse_amount(price) m = re.search(locale.gift_card_to, description_node.text.strip(), re.MULTILINE|re.UNICODE) - print(m) if m is None: + # if no match is found # check if Amazon account has been charged up m = re.search(locale.gift_card_amazon_account, description_node.text.strip(), re.MULTILINE|re.UNICODE) - print(m) - description = m.group('type').strip() + ' ' + m.group('sent_to').strip() + if m is None: + errors.append('Failed to extract item description') + description='' + else: + description = m.group('type').strip() + ' ' + m.group('sent_to').strip() items.append( Item( From 1529e0b82b7440b32584577737899e8d402a567b Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Thu, 24 Mar 2022 18:57:25 +0100 Subject: [PATCH 24/42] add test cases for nonshipped headers --- .../source/amazon_invoice_test.py | 2 + .../amazon/de_DE/142-4912939-2196263.html | 317 ++++++++++++++++++ .../amazon/de_DE/142-4912939-2196263.json | 57 ++++ .../amazon/de_DE/588-8509154-9761865.html | 317 ++++++++++++++++++ .../amazon/de_DE/588-8509154-9761865.json | 57 ++++ 5 files changed, 750 insertions(+) create mode 100644 testdata/source/amazon/de_DE/142-4912939-2196263.html create mode 100644 testdata/source/amazon/de_DE/142-4912939-2196263.json create mode 100644 testdata/source/amazon/de_DE/588-8509154-9761865.html create mode 100644 testdata/source/amazon/de_DE/588-8509154-9761865.json diff --git a/beancount_import/source/amazon_invoice_test.py b/beancount_import/source/amazon_invoice_test.py index 1dfc3166..e9f445e9 100644 --- a/beancount_import/source/amazon_invoice_test.py +++ b/beancount_import/source/amazon_invoice_test.py @@ -39,6 +39,8 @@ def test_parsing_en_EN(name: str): 'D22-9220967-2566135', # digital order, audible subscription 'D60-9825125-4795642', # digital order '399-5779972-5007935', # Direct Debit (Bankeinzug) + '588-8509154-9761865', # preparing shipment + '142-4912939-2196263', # shipping soon ]) def test_parsing_de_DE(name: str): testdata_dir_locale = os.path.join(testdata_dir, 'de_DE') diff --git a/testdata/source/amazon/de_DE/142-4912939-2196263.html b/testdata/source/amazon/de_DE/142-4912939-2196263.html new file mode 100644 index 00000000..b0c37cc3 --- /dev/null +++ b/testdata/source/amazon/de_DE/142-4912939-2196263.html @@ -0,0 +1,317 @@ + + + + + + +Amazon.de - Bestellung 142-4912939-2196263 + + + + +
    +
    +
    + Übersicht zur Bestellung #142-4912939-2196263 +
    +Bitte drucken Sie diese Seite aus und legen Sie sie zu Ihren Unterlagen. +

    + + + + +
    + + + + +
    + + Bestellung aufgegeben am: + + 19. Februar 2022 +
    +Bestellnummer: + 142-4912939-2196263 +
    +Gesamtbestellwert: + EUR 15,99 +
    +
    + + + + +
    + + + + + + + + + + +
    + + + + +
    +
    + Versand in Kürze +
    +
    +
    + + + + +
    + + + + +
    +   +
    + + + + + + + + + +
    +Bestellte Artikel + +Preis +
    + + 1 + + Exemplar(e) von: + + CS Labs Wärmeleitpaste & Pad Ersatz-Set, K5-PRO K4-PRO kompatibel mit iPhone, Mac PS4 PS3 Xbox Asus Dell usw.
    + + Verkauf durch: WWW.COMPUTER-SYSTEMS.GR (Mitgliedsprofil) + + + + + + + + + + + + + + + + + + + + + + + + | Haben Sie eine Frage zum Produkt? Frage an den Verkäufer stellen +
    +
    + + Zustand: Neu
    +
    +
    +EUR 15,99
    +
    +
    +
    +
    + + + + + +
    + +Versandadresse + +
    + + + + + + + + + +
    +Versandart: + +
    +Premiumversand +
    +
    + +
    +
    +
    +
    +
    + + + + +
    + + + + + + + +
    + + + + +
    +
    Zahlungsdaten
    +
    +
    + + + + +
    + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Zwischensumme:EUR 13,44
    Verpackung & Versand:EUR 0,00
     -----
    Summe ohne MwSt.:EUR 13,44
    Anzurechnende MwSt.:EUR 2,55
     -----
    Summe:EUR 15,99
     -----
    Gesamtsumme: EUR 15,99
    +
    +Zahlungsart: +
    + + + Bankeinzug + | Die letzten Ziffern: 600
    +
    +Rechnungsadresse: + + + + + + + + + +
    +
    +
    +
    +
    +

    Um den Status Ihrer Bestellung einzusehen, kehren Sie auf Bestellungsübersicht zurück.

    +

    Hinweis: Dies ist keine Rechnung.

    +
    + +
    + + + +
    + +Unsere AGB |  + + + + + + +Datenschutzerklärung |  + + + + + + +Impressum  © 6029-8252, Amazon.com, Inc. und Tochtergesellschaften +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/142-4912939-2196263.json b/testdata/source/amazon/de_DE/142-4912939-2196263.json new file mode 100644 index 00000000..fbc13d44 --- /dev/null +++ b/testdata/source/amazon/de_DE/142-4912939-2196263.json @@ -0,0 +1,57 @@ +{ + "order_id": "142-4912939-2196263", + "order_date": "2022-02-19", + "shipments": [ + { + "shipped_date": null, + "items": [ + { + "quantity": "1", + "description": "CS Labs W\u00e4rmeleitpaste & Pad Ersatz-Set, K5-PRO K4-PRO kompatibel mit iPhone, Mac PS4 PS3 Xbox Asus Dell usw.", + "sold_by": "WWW.COMPUTER-SYSTEMS.GR", + "condition": "Neu", + "price": { + "number": "15.99", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": { + "number": "15.99", + "currency": "EUR" + }, + "posttax_adjustments": [], + "tax": [], + "total": { + "number": "15.99", + "currency": "EUR" + }, + "errors": [] + } + ], + "credit_card_transactions": [ + { + "date": "2022-02-19", + "card_description": "Bankeinzug", + "card_ending_in": "600", + "amount": { + "number": "15.99", + "currency": "EUR" + } + } + ], + "pretax_adjustments": [ + { + "description": "Verpackung & Versand", + "amount": { + "number": "0.00", + "currency": "EUR" + } + } + ], + "tax": [], + "posttax_adjustments": [], + "errors": [] +} \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/588-8509154-9761865.html b/testdata/source/amazon/de_DE/588-8509154-9761865.html new file mode 100644 index 00000000..70a645a2 --- /dev/null +++ b/testdata/source/amazon/de_DE/588-8509154-9761865.html @@ -0,0 +1,317 @@ + + + + + + +Amazon.de - Bestellung 588-8509154-9761865 + + + + +
    +
    +
    + Übersicht zur Bestellung #588-8509154-9761865 +
    +Bitte drucken Sie diese Seite aus und legen Sie sie zu Ihren Unterlagen. +

    + + + + +
    + + + + +
    + + Bestellung aufgegeben am: + + 19. Februar 2022 +
    +Bestellnummer: + 588-8509154-9761865 +
    +Gesamtbestellwert: + EUR 15,99 +
    +
    + + + + +
    + + + + + + + + + + +
    + + + + +
    +
    + Versand wird vorbereitet +
    +
    +
    + + + + +
    + + + + +
    +   +
    + + + + + + + + + +
    +Bestellte Artikel + +Preis +
    + + 1 + + Exemplar(e) von: + + CS Labs Wärmeleitpaste & Pad Ersatz-Set, K5-PRO K4-PRO kompatibel mit iPhone, Mac PS4 PS3 Xbox Asus Dell usw.
    + + Verkauf durch: WWW.COMPUTER-SYSTEMS.GR (Mitgliedsprofil) + + + + + + + + + + + + + + + + + + + + + + + + | Haben Sie eine Frage zum Produkt? Frage an den Verkäufer stellen +
    +
    + + Zustand: Neu
    +
    +
    +EUR 15,99
    +
    +
    +
    +
    + + + + + +
    + +Versandadresse + +
    + + + + + + + + + +
    +Versandart: + +
    +Premiumversand +
    +
    + +
    +
    +
    +
    +
    + + + + +
    + + + + + + + +
    + + + + +
    +
    Zahlungsdaten
    +
    +
    + + + + +
    + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Zwischensumme:EUR 13,44
    Verpackung & Versand:EUR 0,00
     -----
    Summe ohne MwSt.:EUR 13,44
    Anzurechnende MwSt.:EUR 2,55
     -----
    Summe:EUR 15,99
     -----
    Gesamtsumme: EUR 15,99
    +
    +Zahlungsart: +
    + + + Bankeinzug + | Die letzten Ziffern: 600
    +
    +Rechnungsadresse: + + + + + + + + + +
    +
    +
    +
    +
    +

    Um den Status Ihrer Bestellung einzusehen, kehren Sie auf Bestellungsübersicht zurück.

    +

    Hinweis: Dies ist keine Rechnung.

    +
    + +
    + + + +
    + +Unsere AGB |  + + + + + + +Datenschutzerklärung |  + + + + + + +Impressum  © 6257-8850, Amazon.com, Inc. und Tochtergesellschaften +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/588-8509154-9761865.json b/testdata/source/amazon/de_DE/588-8509154-9761865.json new file mode 100644 index 00000000..4fe63119 --- /dev/null +++ b/testdata/source/amazon/de_DE/588-8509154-9761865.json @@ -0,0 +1,57 @@ +{ + "order_id": "588-8509154-9761865", + "order_date": "2022-02-19", + "shipments": [ + { + "shipped_date": null, + "items": [ + { + "quantity": "1", + "description": "CS Labs W\u00e4rmeleitpaste & Pad Ersatz-Set, K5-PRO K4-PRO kompatibel mit iPhone, Mac PS4 PS3 Xbox Asus Dell usw.", + "sold_by": "WWW.COMPUTER-SYSTEMS.GR", + "condition": "Neu", + "price": { + "number": "15.99", + "currency": "EUR" + } + } + ], + "items_subtotal": null, + "pretax_adjustments": [], + "total_before_tax": { + "number": "15.99", + "currency": "EUR" + }, + "posttax_adjustments": [], + "tax": [], + "total": { + "number": "15.99", + "currency": "EUR" + }, + "errors": [] + } + ], + "credit_card_transactions": [ + { + "date": "2022-02-19", + "card_description": "Bankeinzug", + "card_ending_in": "600", + "amount": { + "number": "15.99", + "currency": "EUR" + } + } + ], + "pretax_adjustments": [ + { + "description": "Verpackung & Versand", + "amount": { + "number": "0.00", + "currency": "EUR" + } + } + ], + "tax": [], + "posttax_adjustments": [], + "errors": [] +} \ No newline at end of file From 8b5a87843e0d8f824765a7a5d2780ce1ddb67fc8 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Thu, 24 Mar 2022 16:50:26 +0100 Subject: [PATCH 25/42] correct locale name en_EN to en_US --- beancount_import/source/amazon.py | 6 +++--- beancount_import/source/amazon_invoice.py | 22 +++++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/beancount_import/source/amazon.py b/beancount_import/source/amazon.py index 85927f52..703420ed 100644 --- a/beancount_import/source/amazon.py +++ b/beancount_import/source/amazon.py @@ -41,7 +41,7 @@ 'Gift Card Amount': 'Assets:Gift-Cards:Amazon', 'Rewards Points': 'Income:Amazon:Cashback', }, - locale='en_EN' # optional, defaults to 'en_EN' + locale='en_US' # optional, defaults to 'en_US' ) The `amazon_account` key must be specified, and should be set to the email @@ -56,7 +56,7 @@ prediction will likely handle them. The `locale` sets country/language specific settings. -Currently, `en_EN` and `de_DE` are available. +Currently, `en_US` and `de_DE` are available. Specifying credit cards ======================= @@ -547,7 +547,7 @@ def __init__(self, posttax_adjustment_accounts: Dict[str, str] = {}, pickle_dir: str = None, earliest_date: datetime.date = None, - locale='en_EN', + locale='en_US', **kwargs) -> None: super().__init__(**kwargs) self.directory = directory diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 365ecca9..46ae7497 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -68,12 +68,12 @@ class Locale_Data(): digital_payment_information: str -class Locale_en_EN(Locale_Data): +class Locale_en_US(Locale_Data): """Language and region specific settings for parsing amazon.com invoices """ def __init__(self) -> None: super().__init__( - LOCALE='en_EN', + LOCALE='en_US', tax_included_in_price=False, payee='Amazon.com', @@ -255,7 +255,7 @@ def parse_date(date_str) -> datetime.date: LOCALES_type = Dict[str, Any] -LOCALES: LOCALES_type = {'en_EN': Locale_en_EN, 'de_DE': Locale_de_DE} +LOCALES: LOCALES_type = {'en_US': Locale_en_US, 'de_DE': Locale_de_DE} Errors = List[str] Adjustment = NamedTuple('Adjustment', [ @@ -354,7 +354,7 @@ def predicate(node): return results -def get_adjustments_in_table(table, pattern, assumed_currency=None, locale=Locale_en_EN()): +def get_adjustments_in_table(table, pattern, assumed_currency=None, locale=Locale_en_US()): adjustments = [] for label, amount_str in get_field_in_table( table, pattern, allow_multiple=True, return_label=True): @@ -374,7 +374,7 @@ def reduce_adjustments(adjustments: List[Adjustment]) -> List[Adjustment]: ] -def parse_shipments(soup, locale=Locale_en_EN()) -> List[Shipment]: +def parse_shipments(soup, locale=Locale_en_US()) -> List[Shipment]: """ Parses Shipment Table Part of HTML document (1st Table) """ @@ -538,7 +538,7 @@ def is_items_ordered_header(node): def parse_credit_card_transactions_from_payments_table( payment_table, order_date: datetime.date, - locale=Locale_en_EN()) -> List[CreditCardTransaction]: + locale=Locale_en_US()) -> List[CreditCardTransaction]: """ Parse payment information from payments table. Only type and last digits are given, no amount (assuming grand total). Other payment methods than credit card are possible: @@ -568,7 +568,7 @@ def parse_credit_card_transactions_from_payments_table( return credit_card_transactions -def parse_credit_card_transactions(soup, locale=Locale_en_EN()) -> List[CreditCardTransaction]: +def parse_credit_card_transactions(soup, locale=Locale_en_US()) -> List[CreditCardTransaction]: """ Parse Credit Card Transactions from bottom sub-table of payments table. Transactions are listed with type, 4 digits, transaction date and amount. """ @@ -601,7 +601,7 @@ def is_header_node(node): return transactions -def parse_invoice(path: str, locale=Locale_en_EN()) -> Optional[Order]: +def parse_invoice(path: str, locale=Locale_en_US()) -> Optional[Order]: """ 1st method to call, distinguish between regular and digital invoice. """ if os.path.basename(path).startswith('D'): @@ -611,7 +611,7 @@ def parse_invoice(path: str, locale=Locale_en_EN()) -> Optional[Order]: return parse_regular_order_invoice(path, locale=locale) -def parse_regular_order_invoice(path: str, locale=Locale_en_EN()) -> Order: +def parse_regular_order_invoice(path: str, locale=Locale_en_US()) -> Order: errors = [] with open(path, 'rb') as f: soup = bs4.BeautifulSoup(f.read(), 'lxml') @@ -783,7 +783,7 @@ def get_text_lines(parent_node): return text_lines -def parse_digital_order_invoice(path: str, locale=Locale_en_EN()) -> Optional[Order]: +def parse_digital_order_invoice(path: str, locale=Locale_en_US()) -> Optional[Order]: errors = [] with open(path, 'rb') as f: soup = bs4.BeautifulSoup(f.read(), 'lxml') @@ -989,7 +989,7 @@ def main(): action='store_true', help='Output in JSON format.') ap.add_argument( - '--locale', default='en_EN', help='Local Amazon settings, defaults to EN') + '--locale', default='en_US', help='Local Amazon settings, defaults to en_US') ap.add_argument('paths', nargs='*') args = ap.parse_args() From e8d1e834d647d389d572959bafac8e32401335a9 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Thu, 24 Mar 2022 17:59:07 +0100 Subject: [PATCH 26/42] add Locale base class --- beancount_import/source/amazon_invoice.py | 301 +++++++++++----------- 1 file changed, 155 insertions(+), 146 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 46ae7497..c7315ea4 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -8,6 +8,7 @@ import functools import datetime import logging +from abc import ABC, abstractmethod import bs4 import dateutil.parser @@ -20,8 +21,7 @@ logger = logging.getLogger('amazon_invoice') -@dataclasses.dataclass -class Locale_Data(): +class Locale_Data(ABC): LOCALE: str tax_included_in_price: bool payee: str @@ -67,82 +67,92 @@ class Locale_Data(): digital_order_id: str digital_payment_information: str + @staticmethod + @abstractmethod + def parse_amount(amount, assumed_currency=None) -> Amount: + raise NotImplementedError + + @staticmethod + @abstractmethod + def parse_date(date_str) -> datetime.date: + raise NotImplementedError + class Locale_en_US(Locale_Data): """Language and region specific settings for parsing amazon.com invoices """ - def __init__(self) -> None: - super().__init__( - LOCALE='en_US', - tax_included_in_price=False, - payee='Amazon.com', - - # common fields regular and digital orders - items_ordered='Items Ordered', # shipment + digital - price='Price', # shipment + digital - currency='USD', # shipment only - items_subtotal=r'Item\(s\) Subtotal:', # shipment +digital - total_before_tax='Total Before Tax:', # shipment + digital - pretax_adjustment_fields_pattern=('(?:' + '|'.join([ - 'Shipping & Handling', # Verpackung & Versand: - 'Free Shipping', - 'Free delivery', - 'Pantry delivery', - 'Promotion(?:s| Applied)', # Gutschein eingelöst: - 'Lightning Deal', - 'Your Coupon Savings', - '[0-9]+% off savings', - 'Subscribe & Save', - '[0-9]+ Audible Credit Applied', - '.*[0-9]+% Off.*', - 'Courtesy Credit', - 'Extra Savings', - '(?:.*) Discount', - 'Gift[ -]Wrap', - ]) + ') *:'), - posttax_adjustment_fields_pattern=r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X', - - # Payment Table & Credit Card Transactions - grand_total=r'\n\s*Grand Total:\s+(.*)\n', - credit_card_transactions='Credit Card transactions', - credit_card_last_digits=r'^([^:]+) ending in ([0-9]+):\s+([^:]+):$', - payment_type=[ - # only first matching regex is used! - r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Last (?:4 )?digits:\s+([0-9]{4})\n', - r'\n\s*(.+)\s+ending in\s+([0-9]{4})\n' - ], - payment_information='^Payment information$', - - # regular orders only - shipment_shipped_pattern='^Shipped on ([^\\n]+)$', - shipment_nonshipped_headers=[ - 'Service completed', - 'Preparing for Shipment', - 'Not Yet Shipped', - 'Shipping now' - ], - shipment_quantity=r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+of:', - shipment_of='of:', - shipment_sales_tax='Sales Tax:', - shipment_total='Total for This Shipment:', - shipment_seller_profile=' (seller profile)', - shipment_sold_by=r'(?P.*)\n\s*(?:Sold|Provided) by:? (?P[^\n]+)', - shipment_condition=r'\n.*\n\s*Condition: (?P[^\n]+)', - regular_total_order='Grand Total:', - regular_estimated_tax = 'Estimated tax to be collected:', - regular_order_placed=r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})', - regular_order_id=r'.*Order ([0-9\-]+)', - - # digital orders only - digital_order='Digital Order: (.*)', - digital_order_cancelled='Order Canceled', - digital_by='By', - digital_sold_by=r'Sold\s+By', - digital_tax_collected='Tax Collected:', - digital_total_order='Total for this Order:', - digital_order_id='^Amazon.com\\s+order number:\\s+(D[0-9-]+)$', - digital_payment_information='Payment Information' - ) + LOCALE='en_US' + tax_included_in_price=False + payee='Amazon.com' + + # common fields regular and digital orders + items_ordered='Items Ordered' # shipment + digital + price='Price' # shipment + digital + currency='USD' # shipment only + items_subtotal=r'Item\(s\) Subtotal:' # shipment +digital + total_before_tax='Total Before Tax:' # shipment + digital + pretax_adjustment_fields_pattern=('(?:' + '|'.join([ + 'Shipping & Handling', # Verpackung & Versand: + 'Free Shipping', + 'Free delivery', + 'Pantry delivery', + 'Promotion(?:s| Applied)', # Gutschein eingelöst: + 'Lightning Deal', + 'Your Coupon Savings', + '[0-9]+% off savings', + 'Subscribe & Save', + '[0-9]+ Audible Credit Applied', + '.*[0-9]+% Off.*', + 'Courtesy Credit', + 'Extra Savings', + '(?:.*) Discount', + 'Gift[ -]Wrap', + ]) + ') *:') + posttax_adjustment_fields_pattern=r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X' + + # Payment Table & Credit Card Transactions + grand_total=r'\n\s*Grand Total:\s+(.*)\n' + credit_card_transactions='Credit Card transactions' + credit_card_last_digits=r'^([^:]+) ending in ([0-9]+):\s+([^:]+):$' + payment_type=[ + # only first matching regex is used! + r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Last (?:4 )?digits:\s+([0-9]{4})\n', + r'\n\s*(.+)\s+ending in\s+([0-9]{4})\n' + ] + payment_information='^Payment information$' + + # regular orders only + shipment_shipped_pattern='^Shipped on ([^\\n]+)$' + shipment_nonshipped_headers=[ + 'Service completed', + 'Preparing for Shipment', + 'Not Yet Shipped', + 'Shipping now' + ] + shipment_quantity=r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+of:' + shipment_of='of:' + shipment_sales_tax='Sales Tax:' + shipment_total='Total for This Shipment:' + shipment_seller_profile=' (seller profile)' + shipment_sold_by=r'(?P.*)\n\s*(?:Sold|Provided) by:? (?P[^\n]+)' + shipment_condition=r'\n.*\n\s*Condition: (?P[^\n]+)' + regular_total_order='Grand Total:' + regular_estimated_tax = 'Estimated tax to be collected:' + regular_order_placed=r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})' + regular_order_id=r'.*Order ([0-9\-]+)' + gift_card='Gift Cards' # not confirmed yet! + gift_card_to=r'^(?PGift Card)[\w\s-]*:\s*(?P[\w@._-]*)$' # guess, not confirmed yet! + gift_card_amazon_account=r'^[\w\s-]*(?PAmazon-Account)[\w\s-]*(?Pcharged up)[\w\s-]*$' # guess, not confirmed yet! + + # digital orders only + digital_order='Digital Order: (.*)' + digital_order_cancelled='Order Canceled' + digital_by='By' + digital_sold_by=r'Sold\s+By' + digital_tax_collected='Tax Collected:' + digital_total_order='Total for this Order:' + digital_order_id='^Amazon.com\\s+order number:\\s+(D[0-9-]+)$' + digital_payment_information='Payment Information' @staticmethod def parse_amount(amount, assumed_currency=None) -> Amount: @@ -156,74 +166,74 @@ def parse_date(date_str) -> datetime.date: class Locale_de_DE(Locale_Data): """Language and region specific settings for parsing amazon.de invoices """ - def __init__(self): - super().__init__( - LOCALE='de_DE', - tax_included_in_price=True, # no separate tax transactions - payee='Amazon.de', - - # common fields regular and digital orders - items_ordered='Bestellte Artikel', - price='Preis', - currency='EUR', - items_subtotal='Zwischensumme:', - total_before_tax='Summe ohne MwSt.:', - # most of translations still missing ... - pretax_adjustment_fields_pattern=('(?:' + '|'.join([ - 'Verpackung & Versand', - # 'Free Shipping', 'Free delivery', 'Pantry delivery', - # 'Promotion(?:s| Applied)', 'Lightning Deal', - # 'Your Coupon Savings', '[0-9]+% off savings', - # 'Subscribe & Save', '[0-9]+ Audible Credit Applied', - # '.*[0-9]+% Off.*', 'Courtesy Credit', - # 'Extra Savings', '(?:.*) Discount', 'Gift[ -]Wrap', - ]) + ') *:'), - # most adjustments in DE are posttax: - posttax_adjustment_fields_pattern='Gutschein eingelöst:|Geschenkgutschein\(e\):', - - # Payment Table & Credit Card Transactions - grand_total=r'\n\s*(?:Gesamtsumme|Endsumme):\s+(.*)\n', # regular: Gesamtsumme, digital: Endsumme - credit_card_transactions='Kreditkarten-Transaktionen', - credit_card_last_digits=r'^([^:]+) mit den Endziffern ([0-9]+):\s+([^:]+):$', - payment_type=[ - # only first matching regex is used! - r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Die letzten (?:4 )?Ziffern:\s*([0-9]{3,4})', # 3 digits for Bankeinzug - r'\n\s*(.+)\s+mit den Endziffern\s+([0-9]{4})\n' - ], - payment_information='^Zahlungsdaten$', - - # regular orders only - shipment_shipped_pattern='^versandt am ([^\\n]+)$', - shipment_nonshipped_headers={ - 'Versand wird vorbereitet', - 'Versand in Kürze', - # Translations missing - 'Service completed', - 'Not Yet Shipped', - 'Shipping now' - }, - shipment_quantity=r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+Exemplar\(e\)\svon:', - shipment_of='Exemplar(e) von:', - shipment_sales_tax='Anzurechnende MwSt.:', # not sure (only old invoices) - shipment_total='Gesamtsumme:', - shipment_seller_profile=' (Mitgliedsprofil)', - shipment_sold_by=r'(?P.*)\n\s*(?:Verkauf) durch:? (?P[^\n]+)', - shipment_condition=r'\n.*\n\s*Zustand: (?P[^\n]+)', - regular_total_order='Gesamtsumme:', - regular_estimated_tax='Anzurechnende MwSt.:', - regular_order_placed=r'(?:Getätigte Spar-Abo-Bestellung|Bestellung aufgegeben am):\s+(\d+\. [^\s]+ \d{4})', - regular_order_id=r'.*Bestellung ([0-9\-]+)', - - # digital orders only - digital_order_cancelled='Order Canceled', - digital_order='Digitale Bestellung: (.*)', - digital_by='Von', - digital_sold_by=r'Verkauft von', - digital_tax_collected='MwSt:', - digital_total_order='Endsumme:', - digital_order_id='^Amazon.de\\s+Bestellnummer:\\s+(D[0-9-]+)$', - digital_payment_information='Zahlungsinformation' - ) + LOCALE='de_DE' + tax_included_in_price=True # no separate tax transactions + payee='Amazon.de' + + # common fields regular and digital orders + items_ordered='Bestellte Artikel|Erhalten|Versendet|Amazon-Konto erfolgreich aufgeladen' # Erhalten|Versendet for gift cards + price='Preis|Betrag' + currency='EUR' + items_subtotal='Zwischensumme:' + total_before_tax='Summe ohne MwSt.:' + # most of translations still missing ... + pretax_adjustment_fields_pattern=('(?:' + '|'.join([ + 'Verpackung & Versand', + # 'Free Shipping', 'Free delivery', 'Pantry delivery', + # 'Promotion(?:s| Applied)', 'Lightning Deal', + # 'Your Coupon Savings', '[0-9]+% off savings', + # 'Subscribe & Save', '[0-9]+ Audible Credit Applied', + # '.*[0-9]+% Off.*', 'Courtesy Credit', + # 'Extra Savings', '(?:.*) Discount', 'Gift[ -]Wrap', + ]) + ') *:') + # most adjustments in DE are posttax: + posttax_adjustment_fields_pattern='Gutschein eingelöst:|Geschenkgutschein\(e\):' + + # Payment Table & Credit Card Transactions + grand_total=r'\n\s*(?:Gesamtsumme|Endsumme):\s+(.*)\n' # regular: Gesamtsumme, digital: Endsumme + credit_card_transactions='Kreditkarten-Transaktionen' + credit_card_last_digits=r'^([^:]+) mit den Endziffern ([0-9]+):\s+([^:]+):$' + payment_type=[ + # only first matching regex is used! + r'\n\s*([^\s|][^|\n]*[^|\s])\s+\|\s+Die letzten (?:4 )?Ziffern:\s*([0-9]{3,4})', # 3 digits for Bankeinzug + r'\n\s*(.+)\s+mit den Endziffern\s+([0-9]{4})\n' + ] + payment_information='^Zahlungsdaten$' + + # regular orders only + shipment_shipped_pattern='^versandt am ([^\\n]+)$' + shipment_nonshipped_headers=[ + 'Versand wird vorbereitet', + 'Versand in Kürze', + # Translations missing + 'Service completed', + 'Not Yet Shipped', + 'Shipping now' + ] + shipment_quantity=r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+Exemplar\(e\)\svon:' + shipment_of='Exemplar(e) von:' + shipment_sales_tax='Anzurechnende MwSt.:' # not sure (only old invoices) + shipment_total='Gesamtsumme:' + shipment_seller_profile=' (Mitgliedsprofil)' + shipment_sold_by=r'(?P.*)\n\s*(?:Verkauf) durch:? (?P[^\n]+)' + shipment_condition=r'\n.*\n\s*Zustand: (?P[^\n]+)' + regular_total_order='Gesamtsumme:' + regular_estimated_tax='Anzurechnende MwSt.:' + regular_order_placed=r'(?:Getätigte Spar-Abo-Bestellung|Bestellung aufgegeben am):\s+(\d+\. [^\s]+ \d{4})' + regular_order_id=r'.*Bestellung ([0-9\-]+)' + gift_card='Geschenkgutscheine' + gift_card_to=r'^(?PGeschenkgutschein)[\w\s-]*:\s*(?P[\w@._-]*)$' + gift_card_amazon_account=r'^[\w\s-]*(?PAmazon-Konto)[\w\s-]*(?Paufgeladen)[\w\s-]*$' + + # digital orders only + digital_order_cancelled='Order Canceled' + digital_order='Digitale Bestellung: (.*)' + digital_by='Von' + digital_sold_by=r'Verkauft von' + digital_tax_collected='MwSt:' + digital_total_order='Endsumme:' + digital_order_id='^Amazon.de\\s+Bestellnummer:\\s+(D[0-9-]+)$' + digital_payment_information='Zahlungsinformation' @staticmethod def _format_number_str(value: str) -> str: @@ -254,8 +264,7 @@ def parse_date(date_str) -> datetime.date: return dateutil.parser.parse(date_str, parserinfo=Locale_de_DE._parserinfo(dayfirst=True)).date() -LOCALES_type = Dict[str, Any] -LOCALES: LOCALES_type = {'en_US': Locale_en_US, 'de_DE': Locale_de_DE} +LOCALES = {x.LOCALE: x for x in [Locale_en_US, Locale_de_DE]} Errors = List[str] Adjustment = NamedTuple('Adjustment', [ @@ -265,7 +274,7 @@ def parse_date(date_str) -> datetime.date: Item = NamedTuple('Item', [ ('quantity', Decimal), ('description', str), - ('sold_by', str), + ('sold_by', Optional[str]), ('condition', Optional[str]), ('price', Amount), ]) @@ -993,7 +1002,7 @@ def main(): ap.add_argument('paths', nargs='*') args = ap.parse_args() - locale = LOCALES[args.locale]() + locale = LOCALES[args.locale] results = [] for path in args.paths: try: From d5263ea60c2d0ca2104baa74db0e8ca8a8e9d6d3 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Thu, 24 Mar 2022 21:04:16 +0100 Subject: [PATCH 27/42] clean up imports, do not create locale instance --- beancount_import/source/amazon.py | 2 +- beancount_import/source/amazon_invoice.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/beancount_import/source/amazon.py b/beancount_import/source/amazon.py index 703420ed..e8f801f3 100644 --- a/beancount_import/source/amazon.py +++ b/beancount_import/source/amazon.py @@ -560,7 +560,7 @@ def __init__(self, self.pickler = AmazonPickler(pickle_dir) self.earliest_date = earliest_date - self.locale = LOCALES[locale]() + self.locale = LOCALES[locale] self.invoice_filenames = [] # type: List[Tuple[str, str]] for filename in os.listdir(self.directory): diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 8a597897..1c2caf40 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -1,14 +1,13 @@ """Parses an Amazon.com/.de regular or digital order details HTML file.""" -from typing import NamedTuple, Optional, List, Union, Iterable, Dict, Sequence, cast, Any -import dataclasses +from typing import NamedTuple, Optional, List, Union, Iterable, Dict, Sequence, cast +from abc import ABC, abstractmethod import collections import re import os import functools import datetime import logging -from abc import ABC, abstractmethod import bs4 import dateutil.parser @@ -400,6 +399,7 @@ def is_shipment_header_table(node): header_tables = soup.find_all(is_shipment_header_table) if header_tables is []: + # no shipment table # e.g. if only gift cards in order logger.debug('no shipment table found') return [] @@ -525,7 +525,7 @@ def is_gift_card_header_table(node): if header_tables is []: # if no gift cards in order - logger.debug('no shipment table found') + logger.debug('no gift card table found') return [] shipments = [] # type: List[Shipment] From ff21279bb4b52f912b13da5e14d81ddb6e989deb Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Fri, 25 Mar 2022 10:03:42 +0100 Subject: [PATCH 28/42] reduce excessive logging for amazon fresh invoices with irrelevant quantities --- beancount_import/source/amazon_invoice.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 1c2caf40..ff07f5f8 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -459,11 +459,16 @@ def is_items_ordered_header(node): # on the invoice, which is the total price in this case (but the per-unit price in other cases) - so if there's a quantity # and a weight, ignore the quantity and treat it as 1 # alternately, capture the weight and the per-unit price and multiply out - quantity = m.group("quantity") # ignore quantity for weight items - if quantity is None: + # 'quantity' group: integer, no weight units, no decimals + quantity = m.group("quantity") + # set silently to 1 if other regex groups match + if quantity is None: + quantity = 1 + else: + # regex did not match at all -> log warning quantity = 1 - logger.info("Unable to extract quantity, using 1: %s" % description_node.text) + logger.warning("Unable to extract quantity, using 1: %s" % description_node.text) quantity = D(quantity) From 8715e517f6d77ce50a66b8582baab659831a5884 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Fri, 25 Mar 2022 10:04:55 +0100 Subject: [PATCH 29/42] correct test name --- beancount_import/source/amazon_invoice_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beancount_import/source/amazon_invoice_test.py b/beancount_import/source/amazon_invoice_test.py index 92ab1e1d..f588d899 100644 --- a/beancount_import/source/amazon_invoice_test.py +++ b/beancount_import/source/amazon_invoice_test.py @@ -17,7 +17,7 @@ '166-7926740-5141621', 'D56-5204779-4181560', ]) -def test_parsing_en_EN(name: str): +def test_parsing_en_US(name: str): source_path = os.path.join(testdata_dir, name + '.html') invoice = amazon_invoice.parse_invoice(source_path) json_path = os.path.join(testdata_dir, name + '.json') From 7aac6adbf544a9974087b29a89042a16d6c88772 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Fri, 25 Mar 2022 10:32:42 +0100 Subject: [PATCH 30/42] add missing types --- beancount_import/source/amazon_invoice.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index ff07f5f8..61465a3a 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -417,7 +417,7 @@ def is_shipment_header_table(node): assert m is not None shipped_date = locale.parse_date(m.group(1)) - items = [] + items = [] # type: List[Item] shipment_table = header_table.find_parent('table') @@ -468,7 +468,7 @@ def is_items_ordered_header(node): else: # regex did not match at all -> log warning quantity = 1 - logger.warning("Unable to extract quantity, using 1: %s" % description_node.text) + errors.append("Unable to extract quantity, using 1: %s" % description_node.text) quantity = D(quantity) @@ -538,7 +538,7 @@ def is_gift_card_header_table(node): for header_table in header_tables: - items = [] + items = [] # type: List[Item] shipment_table = header_table.find_parent('table') @@ -601,8 +601,10 @@ def is_items_ordered_header(node): def parse_shipment_payments( - shipment_table, items, errors, - shipped_date=None, locale=Locale_en_US()): + shipment_table, + items, errors, + shipped_date=None, + locale=Locale_en_US()) -> Shipment: """ Parse payment information of single shipments and gift card orders. """ logger.debug('parsing shipment amounts...') @@ -738,7 +740,7 @@ def parse_invoice(path: str, locale=Locale_en_US()) -> Optional[Order]: def parse_regular_order_invoice(path: str, locale=Locale_en_US()) -> Order: - errors = [] + errors = [] # type: Errors with open(path, 'rb') as f: soup = bs4.BeautifulSoup(f.read(), 'lxml') logger.debug('parsing shipments...') @@ -955,7 +957,7 @@ def is_items_ordered_header(node): items_ordered_header = digital_order_table.find(is_items_ordered_header) item_rows = items_ordered_header.find_next_siblings('tr') - items = [] + items = [] # List[Item] other_fields_td = None From f042a9f1a9696cbd3e36d43fd848f03f917b5e66 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Fri, 25 Mar 2022 12:49:34 +0100 Subject: [PATCH 31/42] add docstring with hierarchy of functions --- beancount_import/source/amazon_invoice.py | 33 +++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 61465a3a..bfb09288 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -1,5 +1,34 @@ -"""Parses an Amazon.com/.de regular or digital order details HTML file.""" - +"""Parses an Amazon.com/.de regular or digital order details HTML file. + +Hierarchy of functions for parsing Amazon invoices: + +main(...) + | + + parse_invoice(...) + | | + | + parse_digital_order_invoice(...) + | | | + | | + parse_credit_card_transactions_from_payments_table(...) + | | +-> returns Order(..., shipments, ...) + | | + | + parse_regular_order_invoice(...) + | | + | + parse_shipments(...) + | | + parse_shipment_payments(...) + | | | +-> returns Shipment + | | +-> returns List[Shipment] + | | + | + parse_gift_cards(...) + | | + parse_shipment_payments(...) + | | | +-> returns Shipment + | | +-> returns List[Shipment] + | | + | + parse_credit_card_transactions(...) + | + parse_credit_card_transactions_from_payments_table(...) + | +-> returns Order(..., shipments, ...) + | + +-> returns List[Order] +""" from typing import NamedTuple, Optional, List, Union, Iterable, Dict, Sequence, cast from abc import ABC, abstractmethod import collections From 7338a7b375d70cf0dea5fcccebc50ed8b241a429 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sat, 26 Mar 2022 17:10:55 +0100 Subject: [PATCH 32/42] use static class instead of class instance as default arguments --- beancount_import/source/amazon_invoice.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index bfb09288..2482cd04 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -394,7 +394,7 @@ def predicate(node): return results -def get_adjustments_in_table(table, pattern, assumed_currency=None, locale=Locale_en_US()): +def get_adjustments_in_table(table, pattern, assumed_currency=None, locale=Locale_en_US): adjustments = [] for label, amount_str in get_field_in_table( table, pattern, allow_multiple=True, return_label=True): @@ -414,7 +414,7 @@ def reduce_adjustments(adjustments: List[Adjustment]) -> List[Adjustment]: ] -def parse_shipments(soup, locale=Locale_en_US()) -> List[Shipment]: +def parse_shipments(soup, locale=Locale_en_US) -> List[Shipment]: """ Parses Shipment Table Part of HTML document (1st Table) """ @@ -538,7 +538,7 @@ def is_items_ordered_header(node): return shipments -def parse_gift_cards(soup, locale=Locale_en_US()) -> List[Shipment]: +def parse_gift_cards(soup, locale=Locale_en_US) -> List[Shipment]: """ Parses Gift Card Table Part of HTML document (1st Table) """ @@ -633,7 +633,7 @@ def parse_shipment_payments( shipment_table, items, errors, shipped_date=None, - locale=Locale_en_US()) -> Shipment: + locale=Locale_en_US) -> Shipment: """ Parse payment information of single shipments and gift card orders. """ logger.debug('parsing shipment amounts...') @@ -695,7 +695,7 @@ def parse_shipment_payments( def parse_credit_card_transactions_from_payments_table( payment_table, order_date: datetime.date, - locale=Locale_en_US()) -> List[CreditCardTransaction]: + locale=Locale_en_US) -> List[CreditCardTransaction]: """ Parse payment information from payments table. Only type and last digits are given, no amount (assuming grand total). Other payment methods than credit card are possible: @@ -725,7 +725,7 @@ def parse_credit_card_transactions_from_payments_table( return credit_card_transactions -def parse_credit_card_transactions(soup, locale=Locale_en_US()) -> List[CreditCardTransaction]: +def parse_credit_card_transactions(soup, locale=Locale_en_US) -> List[CreditCardTransaction]: """ Parse Credit Card Transactions from bottom sub-table of payments table. Transactions are listed with type, 4 digits, transaction date and amount. """ @@ -758,7 +758,7 @@ def is_header_node(node): return transactions -def parse_invoice(path: str, locale=Locale_en_US()) -> Optional[Order]: +def parse_invoice(path: str, locale=Locale_en_US) -> Optional[Order]: """ 1st method to call, distinguish between regular and digital invoice. """ if os.path.basename(path).startswith('D'): @@ -768,7 +768,7 @@ def parse_invoice(path: str, locale=Locale_en_US()) -> Optional[Order]: return parse_regular_order_invoice(path, locale=locale) -def parse_regular_order_invoice(path: str, locale=Locale_en_US()) -> Order: +def parse_regular_order_invoice(path: str, locale=Locale_en_US) -> Order: errors = [] # type: Errors with open(path, 'rb') as f: soup = bs4.BeautifulSoup(f.read(), 'lxml') @@ -940,7 +940,7 @@ def get_text_lines(parent_node): return text_lines -def parse_digital_order_invoice(path: str, locale=Locale_en_US()) -> Optional[Order]: +def parse_digital_order_invoice(path: str, locale=Locale_en_US) -> Optional[Order]: errors = [] with open(path, 'rb') as f: soup = bs4.BeautifulSoup(f.read(), 'lxml') From a438613199c51eb82be146efa5e07558df2789ec Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sat, 26 Mar 2022 17:14:23 +0100 Subject: [PATCH 33/42] factor out is_items_ordered_header --- beancount_import/source/amazon_invoice.py | 62 ++++++++--------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 2482cd04..2d487abc 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -412,7 +412,21 @@ def reduce_adjustments(adjustments: List[Adjustment]) -> List[Adjustment]: return [ Adjustment(k, reduce_amounts(v)) for k, v in all_adjustments.items() ] - + + +def is_items_ordered_header(node, locale=Locale_en_US) -> bool: + """ + Identify Header of Items Ordered table (within shipment table) + """ + if node.name != 'tr': + return False + tds = node('td') + if len(tds) < 2: + return False + m1 = re.match(locale.items_ordered, tds[0].text.strip()) + m2 = re.match(locale.price, tds[1].text.strip()) + return(m1 is not None and m2 is not None) + def parse_shipments(soup, locale=Locale_en_US) -> List[Shipment]: """ @@ -449,20 +463,8 @@ def is_shipment_header_table(node): items = [] # type: List[Item] shipment_table = header_table.find_parent('table') - - logger.debug('parsing shipment items...') - def is_items_ordered_header(node): - if node.name != 'tr': - return False - tds = node('td') - if len(tds) < 2: - return False - m1 = re.match(locale.items_ordered, tds[0].text.strip()) - m2 = re.match(locale.price, tds[1].text.strip()) - return(m1 is not None and m2 is not None) - - items_ordered_header = shipment_table.find(is_items_ordered_header) - + items_ordered_header = shipment_table.find( + lambda node: is_items_ordered_header(node, locale)) item_rows = items_ordered_header.find_next_siblings('tr') for item_row in item_rows: @@ -570,20 +572,8 @@ def is_gift_card_header_table(node): items = [] # type: List[Item] shipment_table = header_table.find_parent('table') - - logger.debug('parsing gift card items...') - def is_items_ordered_header(node): - if node.name != 'tr': - return False - tds = node('td') - if len(tds) < 2: - return False - m1 = re.match(locale.items_ordered, tds[0].text.strip()) - m2 = re.match(locale.price, tds[1].text.strip()) - return(m1 is not None and m2 is not None) - - items_ordered_header = shipment_table.find(is_items_ordered_header) - + items_ordered_header = shipment_table.find( + lambda node: is_items_ordered_header(node, locale)) item_rows = [items_ordered_header] for item_row in item_rows: @@ -973,18 +963,8 @@ def is_digital_order_row(node): order_date = locale.parse_date(m.group(1)) logger.debug('parsing items...') - def is_items_ordered_header(node): - if node.name != 'tr': - return False - tds = node('td') - if len(tds) < 2: - return False - m1 = re.match(locale.items_ordered, tds[0].text.strip()) - m2 = re.match(locale.price, tds[1].text.strip()) - return(m1 is not None and m2 is not None) - - items_ordered_header = digital_order_table.find(is_items_ordered_header) - + items_ordered_header = digital_order_table.find( + lambda node: is_items_ordered_header(node, locale)) item_rows = items_ordered_header.find_next_siblings('tr') items = [] # List[Item] From 69d146e7c6c2ca2f81350ad8bcd5dd59ad905965 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 27 Mar 2022 16:41:58 +0200 Subject: [PATCH 34/42] clean up locales, add comments --- beancount_import/source/amazon_invoice.py | 27 ++++++++++++----------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 2d487abc..cf71fa7c 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -53,11 +53,11 @@ class Locale_Data(ABC): LOCALE: str tax_included_in_price: bool payee: str + currency: str # only used for assumed prices # common fields regular and digital orders items_ordered: str price: str - currency: str items_subtotal: str total_before_tax: str pretax_adjustment_fields_pattern: str @@ -115,19 +115,19 @@ class Locale_en_US(Locale_Data): LOCALE='en_US' tax_included_in_price=False payee='Amazon.com' + currency='USD' # only used for assumed prices # common fields regular and digital orders - items_ordered='Items Ordered' # shipment + digital - price='Price' # shipment + digital - currency='USD' # shipment only - items_subtotal=r'Item\(s\) Subtotal:' # shipment +digital - total_before_tax='Total Before Tax:' # shipment + digital + items_ordered='Items Ordered' + price='Price' + items_subtotal=r'Item\(s\) Subtotal:' + total_before_tax='Total Before Tax:' pretax_adjustment_fields_pattern=('(?:' + '|'.join([ - 'Shipping & Handling', # Verpackung & Versand: + 'Shipping & Handling', 'Free Shipping', 'Free delivery', 'Pantry delivery', - 'Promotion(?:s| Applied)', # Gutschein eingelöst: + 'Promotion(?:s| Applied)', 'Lightning Deal', 'Your Coupon Savings', '[0-9]+% off savings', @@ -159,6 +159,8 @@ class Locale_en_US(Locale_Data): 'Preparing for Shipment', 'Not Yet Shipped', 'Shipping now' + # unknown shipment statuses will be ignored + # transaction total will not match ] shipment_quantity=r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+of:' shipment_of='of:' @@ -200,11 +202,11 @@ class Locale_de_DE(Locale_Data): LOCALE='de_DE' tax_included_in_price=True # no separate tax transactions payee='Amazon.de' + currency='EUR' # only used for assumed prices # common fields regular and digital orders items_ordered='Bestellte Artikel|Erhalten|Versendet|Amazon-Konto erfolgreich aufgeladen' # Erhalten|Versendet for gift cards price='Preis|Betrag' - currency='EUR' items_subtotal='Zwischensumme:' total_before_tax='Summe ohne MwSt.:' # most of translations still missing ... @@ -236,10 +238,9 @@ class Locale_de_DE(Locale_Data): shipment_nonshipped_headers=[ 'Versand wird vorbereitet', 'Versand in Kürze', - # Translations missing - 'Service completed', - 'Not Yet Shipped', - 'Shipping now' + # additional cases missing? + # unknown shipment statuses will be ignored + # transaction total will not match ] shipment_quantity=r'^\s*(?:(?P[0-9]+)|(?P[0-9.]+\s+(?:lb|kg))|(?:(?P[0-9.]+) [(](?P[^)]+)[)]))\s+Exemplar\(e\)\svon:' shipment_of='Exemplar(e) von:' From 577e196913a6ffc09515984c6fe14c55e95e6ff7 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 27 Mar 2022 16:56:03 +0200 Subject: [PATCH 35/42] add comments and docstrings --- beancount_import/source/amazon_invoice.py | 91 +++++++++++++++++++++-- 1 file changed, 83 insertions(+), 8 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index cf71fa7c..8eb257a9 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -27,7 +27,7 @@ | + parse_credit_card_transactions_from_payments_table(...) | +-> returns Order(..., shipments, ...) | - +-> returns List[Order] + +-> returns Order """ from typing import NamedTuple, Optional, List, Union, Iterable, Dict, Sequence, cast from abc import ABC, abstractmethod @@ -396,6 +396,8 @@ def predicate(node): def get_adjustments_in_table(table, pattern, assumed_currency=None, locale=Locale_en_US): + """ Parse price adjustments in shipping or payment tables. Returns list of adjustments. + """ adjustments = [] for label, amount_str in get_field_in_table( table, pattern, allow_multiple=True, return_label=True): @@ -406,10 +408,14 @@ def get_adjustments_in_table(table, pattern, assumed_currency=None, locale=Local def reduce_adjustments(adjustments: List[Adjustment]) -> List[Adjustment]: + """ Takes list of adjustments and reduces duplicates by summing up the amounts. + """ + # create dict like {adjustment: [amount1, amount2, ...]} all_adjustments = collections.OrderedDict() # type: Dict[str, List[Amount]] for adjustment in adjustments: all_adjustments.setdefault(adjustment.description, []).append(adjustment.amount) + # sum over amounts and convert back to list of Adjustment return [ Adjustment(k, reduce_amounts(v)) for k, v in all_adjustments.items() ] @@ -438,12 +444,13 @@ def is_shipment_header_table(node): return False text = node.text.strip() m = re.match(locale.shipment_shipped_pattern, text) + # return True for both shipped and nonshipped table headers return m is not None or text in locale.shipment_nonshipped_headers header_tables = soup.find_all(is_shipment_header_table) if header_tables is []: - # no shipment table + # no shipment tables # e.g. if only gift cards in order logger.debug('no shipment table found') return [] @@ -628,6 +635,7 @@ def parse_shipment_payments( """ Parse payment information of single shipments and gift card orders. """ logger.debug('parsing shipment amounts...') + # consistency check: shipment subtotal against sum of item prices items_subtotal = locale.parse_amount( get_field_in_table(shipment_table, locale.items_subtotal)) @@ -639,17 +647,19 @@ def parse_shipment_payments( 'expected items subtotal is %r, but parsed value is %r' % (expected_items_subtotal, items_subtotal)) + # parse pre- and posttax adjustments for shipment output_fields = dict() output_fields['pretax_adjustments'] = get_adjustments_in_table( shipment_table, locale.pretax_adjustment_fields_pattern, locale=locale) output_fields['posttax_adjustments'] = get_adjustments_in_table( shipment_table, locale.posttax_adjustment_fields_pattern, locale=locale) + # compare total before tax pretax_parts = [items_subtotal or expected_items_subtotal] + [ a.amount for a in output_fields['pretax_adjustments'] ] + expected_total_before_tax = reduce_amounts(pretax_parts) total_before_tax = locale.parse_amount( get_field_in_table(shipment_table, locale.total_before_tax)) - expected_total_before_tax = reduce_amounts(pretax_parts) if total_before_tax is None: total_before_tax = expected_total_before_tax elif expected_total_before_tax != total_before_tax: @@ -659,12 +669,13 @@ def parse_shipment_payments( sales_tax = get_adjustments_in_table(shipment_table, locale.shipment_sales_tax, locale=locale) + # compare total posttax_parts = ( [total_before_tax] + [a.amount for a in sales_tax] + [a.amount for a in output_fields['posttax_adjustments']]) + expected_total = reduce_amounts(posttax_parts) total = locale.parse_amount( get_field_in_table(shipment_table, locale.shipment_total)) - expected_total = reduce_amounts(posttax_parts) if total is None: total = expected_total elif expected_total != total: @@ -700,6 +711,7 @@ def parse_credit_card_transactions_from_payments_table( for regex in locale.payment_type: m = re.search(regex, payment_text) if m is not None: + # only take first matching regex, discard others! break if m is not None: @@ -760,12 +772,27 @@ def parse_invoice(path: str, locale=Locale_en_US) -> Optional[Order]: def parse_regular_order_invoice(path: str, locale=Locale_en_US) -> Order: + """ Parse regular order type invoice (HTML document) + 1. parse all shipment tables with individual items + 2. parse payment table + 3. sanity check totals extracted from item prices and payment table + """ errors = [] # type: Errors with open(path, 'rb') as f: soup = bs4.BeautifulSoup(f.read(), 'lxml') + + # ---------------------- + # Shipments & Gift Cards + # ---------------------- logger.debug('parsing shipments...') shipments = parse_shipments(soup, locale=locale) + parse_gift_cards(soup, locale=locale) logger.debug('finished parsing shipments') + + # ------------------------------------------- + # Payment Table: Pre- and Posttax Adjustments + # ------------------------------------------- + # Aim: Parse all pre- and posttax adjustments + # consistency check grand total against sum of item costs logger.debug('parsing payment table...') payment_table_header = soup.find( lambda node: node.name == 'table' and re.match( @@ -782,11 +809,14 @@ def parse_regular_order_invoice(path: str, locale=Locale_en_US) -> Order: # older invoices put pre-tax amounts on a per-shipment basis # new invoices only put pre-tax amounts on the overall payments section # detect which this is + + # payment table pretax adjustments pretax_amount = reduce_amounts( a.amount for a in output_fields['pretax_adjustments']) + shipments_pretax_amount = None - if any(s.pretax_adjustments for s in shipments): + # sum over all shipment pretax amounts shipments_pretax_amount = reduce_amounts(a.amount for shipment in shipments for a in shipment.pretax_adjustments) @@ -805,18 +835,29 @@ def parse_regular_order_invoice(path: str, locale=Locale_en_US) -> Order: get_field_in_table(payment_table, locale.regular_total_order)) def resolve_posttax_adjustments(): + """ Extract and compare posttax adjustments + from shipment and payment tables. + Returns list of reduced Adjustments. + """ + # get reduced form of adjustments from payment table payment_adjustments.update( reduce_adjustments( get_adjustments_in_table(payment_table, locale.posttax_adjustment_fields_pattern, assumed_currency=grand_total.currency, locale=locale))) + # adjustments from all shipments, reduced all_shipments_adjustments = collections.OrderedDict( reduce_adjustments( sum((x.posttax_adjustments for x in shipments), []))) + + # initialize dict with all adjustment keys, values not used + # dict ensures that keys are unique all_keys = collections.OrderedDict(payment_adjustments.items()) all_keys.update(all_shipments_adjustments.items()) - + + # combine shipment and payment adjustments + # make sure that shipment adjustments match payment adjustments all_adjustments = collections.OrderedDict() # type: Dict[str, Amount] for key in all_keys: payment_amount = payment_adjustments.get(key) @@ -830,6 +871,7 @@ def resolve_posttax_adjustments(): # Amazon sometimes doesn't include these adjustments in the Shipment table shipments_total_adjustments.append(amount) elif payment_amount != shipments_amount: + # Both tables include adjustment with same label, but amount does not match errors.append( 'expected total %r to be %s, but parsed total is %s' % (key, shipments_amount, payment_amount)) @@ -839,15 +881,17 @@ def resolve_posttax_adjustments(): output_fields['posttax_adjustments'] = resolve_posttax_adjustments() logger.debug('consistency check taxes...') + # tax from payment table tax = locale.parse_amount( get_field_in_table(payment_table, locale.regular_estimated_tax)) + # tax from shipment tables expected_tax = reduce_amounts( a.amount for shipment in shipments for a in shipment.tax) if expected_tax is None: # tax not given on shipment level if not locale.tax_included_in_price: - # add tax if not already included in item prices + # add tax to adjustments if not already included in item prices shipments_total_adjustments.append(tax) elif expected_tax != tax: errors.append( @@ -885,9 +929,13 @@ def is_order_placed_node(node): assert m is not None order_date = locale.parse_date(m.group(1)) + # --------------------------------------- + # Payment Table: Credit Card Transactions + # --------------------------------------- logger.debug('parsing credit card transactions...') credit_card_transactions = parse_credit_card_transactions(soup, locale=locale) if not credit_card_transactions: + # no explicit credit card transaction table logger.debug('no credit card transactions table given, falling back to payments table') credit_card_transactions = parse_credit_card_transactions_from_payments_table( payment_table, order_date, locale=locale) @@ -920,6 +968,8 @@ def is_order_placed_node(node): def get_text_lines(parent_node): + """ Format nodes into list of strings + """ text_lines = [''] for node in parent_node.children: if isinstance(node, bs4.NavigableString): @@ -932,6 +982,11 @@ def get_text_lines(parent_node): def parse_digital_order_invoice(path: str, locale=Locale_en_US) -> Optional[Order]: + """ Parse digital order type invoice (HTML document) + 1. parse all digital items tables + 2. parse amounts + 3. parse payment table + """ errors = [] with open(path, 'rb') as f: soup = bs4.BeautifulSoup(f.read(), 'lxml') @@ -943,6 +998,9 @@ def is_cancelled_order(node): if soup.find(is_cancelled_order): return None + # -------------------------------------------------- + # Find Digital Order Header, parse date and order ID + # -------------------------------------------------- logger.debug('parsing header...') def is_digital_order_row(node): if node.name != 'tr': @@ -956,13 +1014,15 @@ def is_digital_order_row(node): except: return False - # Find Digital Order row digital_order_header = soup.find(is_digital_order_row) digital_order_table = digital_order_header.find_parent('table') m = re.match(locale.digital_order, digital_order_header.text.strip()) assert m is not None order_date = locale.parse_date(m.group(1)) + # ----------- + # Parse Items + # ----------- logger.debug('parsing items...') items_ordered_header = digital_order_table.find( lambda node: is_items_ordered_header(node, locale)) @@ -974,6 +1034,8 @@ def is_digital_order_row(node): for item_row in item_rows: tds = item_row('td') if len(tds) != 2: + # payment information on order level (not payment table) + # differently formatted, take first column only other_fields_td = tds[0] continue description_node = tds[0] @@ -1013,8 +1075,13 @@ def get_label_value(label): other_fields_text_lines = get_text_lines(other_fields_td) + # ------------------------------------------- + # Parse Amounts, Pre- and Posttax Adjustments + # ------------------------------------------- logger.debug('parsing amounts...') def get_other_field(pattern, allow_multiple=False, return_label=False): + """ Look for pattern in other_fields_text_lines + """ results = [] for line in other_fields_text_lines: r = r'^\s*(' + pattern + r')\s+(.*[^\s])\s*$' @@ -1075,6 +1142,8 @@ def get_amounts_in_text(pattern_map): (expected_total, total_for_this_order)) if locale.tax_included_in_price: + # tax is already inlcuded in item prices + # do not add additional transaction for taxes tax = [] shipment = Shipment( @@ -1095,6 +1164,9 @@ def get_amounts_in_text(pattern_map): assert m is not None order_id = m.group(1) + # ------------- + # Payment Table + # ------------- logger.debug('parsing payment information...') payment_table = soup.find( lambda node: node.name == 'table' and @@ -1112,6 +1184,9 @@ def get_amounts_in_text(pattern_map): credit_card_transactions=credit_card_transactions, pretax_adjustments=[], posttax_adjustments=output_fields['posttax_adjustments'], + # tax given on "shipment level" + # for digital orders tax is always given on shipment level + # therefore tax on order level is irrelevant tax=[], errors=[]) From e4b1ee9ecb394a7538cc5312beab9ea9d4d4a85a Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 27 Mar 2022 16:57:23 +0200 Subject: [PATCH 36/42] add logging and error messages --- beancount_import/source/amazon_invoice.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 8eb257a9..ba194d03 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -468,6 +468,7 @@ def is_shipment_header_table(node): assert m is not None shipped_date = locale.parse_date(m.group(1)) + logger.debug('parsing shipment items...') items = [] # type: List[Item] shipment_table = header_table.find_parent('table') @@ -518,6 +519,7 @@ def is_shipment_header_table(node): if m is None: m = re.match(locale.shipment_sold_by, text, re.UNICODE | re.DOTALL) if m is None: + errors.append("Could not extract item from row {}".format(text)) raise Exception("Could not extract item from row", text) description = re.sub(r'\s+', ' ', m.group('description').strip()) @@ -576,7 +578,7 @@ def is_gift_card_header_table(node): errors = [] # type: Errors for header_table in header_tables: - + logger.debug('parsing gift card items...') items = [] # type: List[Item] shipment_table = header_table.find_parent('table') From 8b6de9c1fbeb157f8e9a46731848db3da84b53fb Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 27 Mar 2022 17:00:25 +0200 Subject: [PATCH 37/42] reduce conditionals in parse_credit_card_transactions_from_payments_table --- beancount_import/source/amazon_invoice.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index ba194d03..af78d0ad 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -716,17 +716,17 @@ def parse_credit_card_transactions_from_payments_table( # only take first matching regex, discard others! break - if m is not None: - credit_card_transactions = [ - CreditCardTransaction( - date=order_date, - amount=grand_total, - card_description=m.group(1).strip(), - card_ending_in=m.group(2).strip(), - ) - ] - else: - credit_card_transactions = [] + if m is None: + return [] + + credit_card_transactions = [ + CreditCardTransaction( + date=order_date, + amount=grand_total, + card_description=m.group(1).strip(), + card_ending_in=m.group(2).strip(), + ) + ] return credit_card_transactions From 38fe5971e8dc8bf18dc1e0c1dec6073f2826455d Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 27 Mar 2022 17:04:23 +0200 Subject: [PATCH 38/42] move order ID and date extraction to beginning of parsing method, more logical --- beancount_import/source/amazon_invoice.py | 51 ++++++++++++----------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index af78d0ad..72a3eec4 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -782,6 +782,24 @@ def parse_regular_order_invoice(path: str, locale=Locale_en_US) -> Order: errors = [] # type: Errors with open(path, 'rb') as f: soup = bs4.BeautifulSoup(f.read(), 'lxml') + + # ----------------- + # Order ID & Order placed date + # ----------------- + logger.debug('parsing order id and order placed date...') + title = soup.find('title').text.strip() + m = re.fullmatch(locale.regular_order_id, title.strip()) + assert m is not None + order_id=m.group(1) + + def is_order_placed_node(node): + m = re.fullmatch(locale.regular_order_placed, node.text.strip()) + return m is not None + + node = soup.find(is_order_placed_node) + m = re.fullmatch(locale.regular_order_placed, node.text.strip()) + assert m is not None + order_date = locale.parse_date(m.group(1)) # ---------------------- # Shipments & Gift Cards @@ -921,16 +939,6 @@ def resolve_posttax_adjustments(): errors.append('expected grand total is %s, but parsed value is %s' % (expected_total, adjusted_grand_total)) - logger.debug('parsing order placed date...') - def is_order_placed_node(node): - m = re.fullmatch(locale.regular_order_placed, node.text.strip()) - return m is not None - - node = soup.find(is_order_placed_node) - m = re.fullmatch(locale.regular_order_placed, node.text.strip()) - assert m is not None - order_date = locale.parse_date(m.group(1)) - # --------------------------------------- # Payment Table: Credit Card Transactions # --------------------------------------- @@ -952,15 +960,10 @@ def is_order_placed_node(node): errors.append('total payment amount is %s, but grand total is %s' % (total_payments, adjusted_grand_total)) - logger.debug('parsing order ID...') - title = soup.find('title').text.strip() - m = re.fullmatch(locale.regular_order_id, title.strip()) - assert m is not None - logger.debug('...finished parsing regular invoice.') return Order( order_date=order_date, - order_id=m.group(1), + order_id=order_id, shipments=shipments, credit_card_transactions=credit_card_transactions, tax=tax, @@ -1022,6 +1025,14 @@ def is_digital_order_row(node): assert m is not None order_date = locale.parse_date(m.group(1)) + order_id_td = soup.find( + lambda node: node.name == 'td' and + re.match(locale.digital_order_id, node.text.strip()) + ) + m = re.match(locale.digital_order_id, order_id_td.text.strip()) + assert m is not None + order_id = m.group(1) + # ----------- # Parse Items # ----------- @@ -1158,14 +1169,6 @@ def get_amounts_in_text(pattern_map): errors=errors, **output_fields) - order_id_td = soup.find( - lambda node: node.name == 'td' and - re.match(locale.digital_order_id, node.text.strip()) - ) - m = re.match(locale.digital_order_id, order_id_td.text.strip()) - assert m is not None - order_id = m.group(1) - # ------------- # Payment Table # ------------- From 2287972d37a70d75dfffd5965302825d1276c166 Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 27 Mar 2022 17:08:01 +0200 Subject: [PATCH 39/42] add check and error message if no items were found for an order --- beancount_import/source/amazon_invoice.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 72a3eec4..ff832386 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -806,6 +806,14 @@ def is_order_placed_node(node): # ---------------------- logger.debug('parsing shipments...') shipments = parse_shipments(soup, locale=locale) + parse_gift_cards(soup, locale=locale) + if len(shipments) == 0: + # no shipment or gift card tables found + msg = ('Identified regular order invoice but no items were found ' + + '(neither shipments nor gift cards). This may be a new type. ' + + 'Consider opening an issue at jbms/beancount-import on github.') + logger.warning(msg) + errors.append(msg) + # do not throw exception, continue parsing the payment table logger.debug('finished parsing shipments') # ------------------------------------------- @@ -1022,7 +1030,12 @@ def is_digital_order_row(node): digital_order_header = soup.find(is_digital_order_row) digital_order_table = digital_order_header.find_parent('table') m = re.match(locale.digital_order, digital_order_header.text.strip()) - assert m is not None + if m is None: + msg = ('Identified digital order invoice but no digital orders were found.') + logger.warning(msg) + errors.append(msg) + # throw exception since there is no other possibility to get order_date + assert m is not None order_date = locale.parse_date(m.group(1)) order_id_td = soup.find( From 4be65b33d717c08cf13309efc51dc36c29066f8c Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 27 Mar 2022 17:18:20 +0200 Subject: [PATCH 40/42] fix handling of cases with no tax; tax on Order level is Amount (None possible), tax on shipment level is List --- beancount_import/source/amazon.py | 2 +- beancount_import/source/amazon_invoice.py | 9 +++++++-- testdata/source/amazon/D56-5204779-4181560.json | 2 +- testdata/source/amazon/de_DE/071-4816388-0694813.json | 2 +- testdata/source/amazon/de_DE/075-2225405-7594823.json | 2 +- testdata/source/amazon/de_DE/142-4912939-2196263.json | 2 +- testdata/source/amazon/de_DE/256-0244967-2403944.json | 2 +- testdata/source/amazon/de_DE/393-2608279-9292916.json | 2 +- testdata/source/amazon/de_DE/399-5779972-5007935.json | 2 +- testdata/source/amazon/de_DE/447-6209054-6766419.json | 2 +- testdata/source/amazon/de_DE/588-8509154-9761865.json | 2 +- testdata/source/amazon/de_DE/898-5185906-0096901.json | 2 +- testdata/source/amazon/de_DE/974-6135682-9358749.json | 2 +- testdata/source/amazon/de_DE/D22-9220967-2566135.json | 2 +- testdata/source/amazon/de_DE/D60-9825125-4795642.json | 2 +- 15 files changed, 21 insertions(+), 16 deletions(-) diff --git a/beancount_import/source/amazon.py b/beancount_import/source/amazon.py index e8f801f3..f768dab3 100644 --- a/beancount_import/source/amazon.py +++ b/beancount_import/source/amazon.py @@ -395,7 +395,7 @@ def make_amazon_transaction( (INVOICE_DESCRIPTION, adjustment.description), ]), )) - if len(invoice.tax)>0 and invoice.tax.number != ZERO: + if invoice.tax is not None and invoice.tax.number != ZERO: txn.postings.append( Posting( account=unknown_account_name, diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index ff832386..49348a00 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -671,6 +671,11 @@ def parse_shipment_payments( sales_tax = get_adjustments_in_table(shipment_table, locale.shipment_sales_tax, locale=locale) + if locale.tax_included_in_price: + # tax is already inlcuded in item prices + # do not add additional Adjustment for taxes + sales_tax = [] + # compare total posttax_parts = ( [total_before_tax] + [a.amount for a in sales_tax] + @@ -928,7 +933,7 @@ def resolve_posttax_adjustments(): if locale.tax_included_in_price: # tax is already inlcuded in item prices # do not add additional transaction for taxes - tax = [] + tax = None logger.debug('consistency check grand total...') payments_total_adjustment = reduce_amounts(payments_total_adjustments) @@ -1205,7 +1210,7 @@ def get_amounts_in_text(pattern_map): # tax given on "shipment level" # for digital orders tax is always given on shipment level # therefore tax on order level is irrelevant - tax=[], + tax=None, errors=[]) diff --git a/testdata/source/amazon/D56-5204779-4181560.json b/testdata/source/amazon/D56-5204779-4181560.json index 35b62d78..09aac4e8 100644 --- a/testdata/source/amazon/D56-5204779-4181560.json +++ b/testdata/source/amazon/D56-5204779-4181560.json @@ -54,7 +54,7 @@ } ], "pretax_adjustments": [], - "tax": [], + "tax": null, "posttax_adjustments": [], "errors": [] } diff --git a/testdata/source/amazon/de_DE/071-4816388-0694813.json b/testdata/source/amazon/de_DE/071-4816388-0694813.json index 48263f5b..d1b68d64 100644 --- a/testdata/source/amazon/de_DE/071-4816388-0694813.json +++ b/testdata/source/amazon/de_DE/071-4816388-0694813.json @@ -43,7 +43,7 @@ } ], "pretax_adjustments": [], - "tax": [], + "tax": null, "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/075-2225405-7594823.json b/testdata/source/amazon/de_DE/075-2225405-7594823.json index 4fa08e1b..fc4caa62 100644 --- a/testdata/source/amazon/de_DE/075-2225405-7594823.json +++ b/testdata/source/amazon/de_DE/075-2225405-7594823.json @@ -43,7 +43,7 @@ } ], "pretax_adjustments": [], - "tax": [], + "tax": null, "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/142-4912939-2196263.json b/testdata/source/amazon/de_DE/142-4912939-2196263.json index fbc13d44..b67120d2 100644 --- a/testdata/source/amazon/de_DE/142-4912939-2196263.json +++ b/testdata/source/amazon/de_DE/142-4912939-2196263.json @@ -51,7 +51,7 @@ } } ], - "tax": [], + "tax": null, "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/256-0244967-2403944.json b/testdata/source/amazon/de_DE/256-0244967-2403944.json index 611b9e79..eaaa047d 100644 --- a/testdata/source/amazon/de_DE/256-0244967-2403944.json +++ b/testdata/source/amazon/de_DE/256-0244967-2403944.json @@ -51,7 +51,7 @@ } } ], - "tax": [], + "tax": null, "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/393-2608279-9292916.json b/testdata/source/amazon/de_DE/393-2608279-9292916.json index 91b59997..fbc6ad9c 100644 --- a/testdata/source/amazon/de_DE/393-2608279-9292916.json +++ b/testdata/source/amazon/de_DE/393-2608279-9292916.json @@ -51,7 +51,7 @@ } } ], - "tax": [], + "tax": null, "posttax_adjustments": [ { "description": "Gutschein eingel\u00f6st", diff --git a/testdata/source/amazon/de_DE/399-5779972-5007935.json b/testdata/source/amazon/de_DE/399-5779972-5007935.json index 4efacf86..e27e7f55 100644 --- a/testdata/source/amazon/de_DE/399-5779972-5007935.json +++ b/testdata/source/amazon/de_DE/399-5779972-5007935.json @@ -51,7 +51,7 @@ } } ], - "tax": [], + "tax": null, "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/447-6209054-6766419.json b/testdata/source/amazon/de_DE/447-6209054-6766419.json index 289c89b9..1ce751c2 100644 --- a/testdata/source/amazon/de_DE/447-6209054-6766419.json +++ b/testdata/source/amazon/de_DE/447-6209054-6766419.json @@ -43,7 +43,7 @@ } ], "pretax_adjustments": [], - "tax": [], + "tax": null, "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/588-8509154-9761865.json b/testdata/source/amazon/de_DE/588-8509154-9761865.json index 4fe63119..370a8b3d 100644 --- a/testdata/source/amazon/de_DE/588-8509154-9761865.json +++ b/testdata/source/amazon/de_DE/588-8509154-9761865.json @@ -51,7 +51,7 @@ } } ], - "tax": [], + "tax": null, "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/898-5185906-0096901.json b/testdata/source/amazon/de_DE/898-5185906-0096901.json index da21a0fb..228b2f36 100644 --- a/testdata/source/amazon/de_DE/898-5185906-0096901.json +++ b/testdata/source/amazon/de_DE/898-5185906-0096901.json @@ -51,7 +51,7 @@ } } ], - "tax": [], + "tax": null, "posttax_adjustments": [ { "description": "Gutschein eingel\u00f6st", diff --git a/testdata/source/amazon/de_DE/974-6135682-9358749.json b/testdata/source/amazon/de_DE/974-6135682-9358749.json index 190107f0..23e63ce1 100644 --- a/testdata/source/amazon/de_DE/974-6135682-9358749.json +++ b/testdata/source/amazon/de_DE/974-6135682-9358749.json @@ -125,7 +125,7 @@ } } ], - "tax": [], + "tax": null, "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/D22-9220967-2566135.json b/testdata/source/amazon/de_DE/D22-9220967-2566135.json index 8c3ea1b4..5c7a6da1 100644 --- a/testdata/source/amazon/de_DE/D22-9220967-2566135.json +++ b/testdata/source/amazon/de_DE/D22-9220967-2566135.json @@ -29,7 +29,7 @@ ], "credit_card_transactions": [], "pretax_adjustments": [], - "tax": [], + "tax": null, "posttax_adjustments": [], "errors": [] } \ No newline at end of file diff --git a/testdata/source/amazon/de_DE/D60-9825125-4795642.json b/testdata/source/amazon/de_DE/D60-9825125-4795642.json index 1b57f6c3..82072efa 100644 --- a/testdata/source/amazon/de_DE/D60-9825125-4795642.json +++ b/testdata/source/amazon/de_DE/D60-9825125-4795642.json @@ -29,7 +29,7 @@ ], "credit_card_transactions": [], "pretax_adjustments": [], - "tax": [], + "tax": null, "posttax_adjustments": [], "errors": [] } \ No newline at end of file From 41a14fac0e243c969782cc144bab5a18633f0e3d Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 27 Mar 2022 17:27:53 +0200 Subject: [PATCH 41/42] update types, fix Shipment tax type --- beancount_import/source/amazon.py | 6 ++-- beancount_import/source/amazon_invoice.py | 35 ++++++++++++----------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/beancount_import/source/amazon.py b/beancount_import/source/amazon.py index f768dab3..fc89f3d8 100644 --- a/beancount_import/source/amazon.py +++ b/beancount_import/source/amazon.py @@ -264,7 +264,7 @@ """ import collections -from typing import Dict, List, Tuple, Optional +from typing import Dict, List, Tuple, Optional, Union import os import sys import pickle @@ -303,7 +303,7 @@ def make_amazon_transaction( - invoice, + invoice: Order, posttax_adjustment_accounts, credit_card_accounts, amazon_account: str, @@ -330,7 +330,7 @@ def make_amazon_transaction( meta = collections.OrderedDict([ (ITEM_DESCRIPTION_KEY, item.description), (SELLER_KEY, item.sold_by), - ]) + ]) # type: Dict[str, Optional[Union[str, datetime.date]]] if isinstance(item, DigitalItem): if item.url: meta[ITEM_URL_KEY] = item.url diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 49348a00..95e6e668 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -323,10 +323,10 @@ def parse_date(date_str) -> datetime.date: ('shipped_date', Optional[datetime.date]), ('items', Sequence[Union[Item, DigitalItem]]), ('items_subtotal', Amount), - ('pretax_adjustments', Sequence[Adjustment]), + ('pretax_adjustments', List[Adjustment]), ('total_before_tax', Amount), ('posttax_adjustments', Sequence[Adjustment]), - ('tax', Amount), + ('tax', List[Adjustment]), ('total', Amount), ('errors', Errors), ]) @@ -395,7 +395,8 @@ def predicate(node): return results -def get_adjustments_in_table(table, pattern, assumed_currency=None, locale=Locale_en_US): +def get_adjustments_in_table( + table, pattern, assumed_currency=None, locale=Locale_en_US) -> List[Adjustment]: """ Parse price adjustments in shipping or payment tables. Returns list of adjustments. """ adjustments = [] @@ -407,7 +408,7 @@ def get_adjustments_in_table(table, pattern, assumed_currency=None, locale=Local return adjustments -def reduce_adjustments(adjustments: List[Adjustment]) -> List[Adjustment]: +def reduce_adjustments(adjustments: Sequence[Adjustment]) -> Sequence[Adjustment]: """ Takes list of adjustments and reduces duplicates by summing up the amounts. """ # create dict like {adjustment: [amount1, amount2, ...]} @@ -704,7 +705,7 @@ def parse_shipment_payments( def parse_credit_card_transactions_from_payments_table( payment_table, order_date: datetime.date, - locale=Locale_en_US) -> List[CreditCardTransaction]: + locale=Locale_en_US) -> Sequence[CreditCardTransaction]: """ Parse payment information from payments table. Only type and last digits are given, no amount (assuming grand total). Other payment methods than credit card are possible: @@ -735,7 +736,7 @@ def parse_credit_card_transactions_from_payments_table( return credit_card_transactions -def parse_credit_card_transactions(soup, locale=Locale_en_US) -> List[CreditCardTransaction]: +def parse_credit_card_transactions(soup, locale=Locale_en_US) -> Sequence[CreditCardTransaction]: """ Parse Credit Card Transactions from bottom sub-table of payments table. Transactions are listed with type, 4 digits, transaction date and amount. """ @@ -748,7 +749,7 @@ def is_header_node(node): return [] sibling = header_node.find_next_sibling('td') rows = sibling.find_all('tr') - transactions = [] + transactions = [] # type: List[CreditCardTransaction] for row in rows: if not row.text.strip(): continue @@ -834,11 +835,10 @@ def is_order_placed_node(node): payment_table = payment_table_header.find_parent('table') logger.debug('parsing pretax adjustments...') - output_fields = dict() + output_fields = dict() # type: Dict[str, List[Adjustment]] output_fields['pretax_adjustments'] = get_adjustments_in_table( payment_table, locale.pretax_adjustment_fields_pattern, locale=locale) - payment_adjustments = collections.OrderedDict() # type: Dict[str, Amount] - + # older invoices put pre-tax amounts on a per-shipment basis # new invoices only put pre-tax amounts on the overall payments section # detect which this is @@ -860,14 +860,15 @@ def is_order_placed_node(node): % (shipments_pretax_amount, pretax_amount)) logger.debug('parsing posttax adjustments...') - payments_total_adjustments = [] - shipments_total_adjustments = [] - # parse first to get an idea of the working currency grand_total = locale.parse_amount( get_field_in_table(payment_table, locale.regular_total_order)) - def resolve_posttax_adjustments(): + payment_adjustments = collections.OrderedDict() # type: Dict[str, Amount] + payments_total_adjustments = [] # type: List[Amount] + shipments_total_adjustments = [] # type: List[Amount] + + def resolve_posttax_adjustments() -> List[Adjustment]: """ Extract and compare posttax adjustments from shipment and payment tables. Returns list of reduced Adjustments. @@ -1005,7 +1006,7 @@ def parse_digital_order_invoice(path: str, locale=Locale_en_US) -> Optional[Orde 2. parse amounts 3. parse payment table """ - errors = [] + errors = [] # type: Errors with open(path, 'rb') as f: soup = bs4.BeautifulSoup(f.read(), 'lxml') @@ -1058,8 +1059,8 @@ def is_digital_order_row(node): items_ordered_header = digital_order_table.find( lambda node: is_items_ordered_header(node, locale)) item_rows = items_ordered_header.find_next_siblings('tr') - items = [] # List[Item] - + + items = [] # Sequence[DigitalItem] other_fields_td = None for item_row in item_rows: From 66298c17933de778a457449286e63b5c4bb9f32a Mon Sep 17 00:00:00 2001 From: Moritz Jung <18733473+moritzj29@users.noreply.github.com> Date: Sun, 27 Mar 2022 17:46:02 +0200 Subject: [PATCH 42/42] make parse_gift_cards optional --- beancount_import/source/amazon_invoice.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/beancount_import/source/amazon_invoice.py b/beancount_import/source/amazon_invoice.py index 95e6e668..cc533f03 100644 --- a/beancount_import/source/amazon_invoice.py +++ b/beancount_import/source/amazon_invoice.py @@ -84,9 +84,9 @@ class Locale_Data(ABC): regular_estimated_tax: str regular_order_placed: str regular_order_id: str - gift_card: str - gift_card_to: str - gift_card_amazon_account: str + gift_card: Optional[str] + gift_card_to: Optional[str] + gift_card_amazon_account: Optional[str] # digital orders only digital_order: str @@ -173,9 +173,6 @@ class Locale_en_US(Locale_Data): regular_estimated_tax = 'Estimated tax to be collected:' regular_order_placed=r'(?:Subscribe and Save )?Order Placed:\s+([^\s]+ \d+, \d{4})' regular_order_id=r'.*Order ([0-9\-]+)' - gift_card='Gift Cards' # not confirmed yet! - gift_card_to=r'^(?PGift Card)[\w\s-]*:\s*(?P[\w@._-]*)$' # guess, not confirmed yet! - gift_card_amazon_account=r'^[\w\s-]*(?PAmazon-Account)[\w\s-]*(?Pcharged up)[\w\s-]*$' # guess, not confirmed yet! # digital orders only digital_order='Digital Order: (.*)' @@ -811,7 +808,9 @@ def is_order_placed_node(node): # Shipments & Gift Cards # ---------------------- logger.debug('parsing shipments...') - shipments = parse_shipments(soup, locale=locale) + parse_gift_cards(soup, locale=locale) + shipments = parse_shipments(soup, locale=locale) + if hasattr(locale, 'gift_card'): + shipments += parse_gift_cards(soup, locale=locale) if len(shipments) == 0: # no shipment or gift card tables found msg = ('Identified regular order invoice but no items were found '