
Commit 8c28df2
Making sure all the post consume scripts use the new cli format, and also speeding things up by caching id->name mappings for document types, storage paths, and correspondents

Jeremy Gillula committed Jul 8, 2023
1 parent 4990b23 commit 8c28df2
Showing 5 changed files with 32 additions and 12 deletions.
10 changes: 10 additions & 0 deletions paperlessngx_postprocessor/paperless_api.py
@@ -23,6 +23,8 @@ def __init__(self, api_url, auth_token, paperless_src_dir, logger=None):
         logging.debug(f"Auth token {auth_token} acquired")
 
         self._auth_token = auth_token
+        self._cache = {}
+        self._cachable_types = ["correspondents", "document_types", "storage_paths", "tags"]
 
     def delete_document_by_id(self, document_id):
         item_type = "documents"
@@ -48,6 +50,11 @@ def _get_item_by_id(self, item_type, item_id):
         return {}
 
     def _get_list(self, item_type, query=None):
+        # If the given item type has been cached, return it
+        if item_type in self._cache and query is None:
+            self._logger.debug(f"Returning {item_type} list from cache")
+            return self._cache[item_type]
+
         items = []
         next_url = f"{self._api_url}/{item_type}/"
         if query is not None:
@@ -62,6 +69,9 @@ def _get_list(self, item_type, query=None):
             else:
                 next_url = None
 
+        if item_type in self._cachable_types:
+            self._cache[item_type] = items
+
         return items
 
     def get_item_id_by_name(self, item_type, item_name):
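The change above is a small memoization layer: once an unfiltered list of a cachable item type has been fetched, later unfiltered requests are served from memory, so repeated id->name lookups for correspondents, document types, storage paths, and tags no longer re-page through the API. A minimal sketch of the pattern with simplified, hypothetical names (pagination and HTTP omitted; note the sketch stores only unfiltered results, since a cached filtered list would be wrong to return for a later unfiltered request):

    class CachedLister:
        CACHABLE_TYPES = ["correspondents", "document_types", "storage_paths", "tags"]

        def __init__(self, fetch_all):
            self._fetch_all = fetch_all  # callable that pages through the real API
            self._cache = {}

        def get_list(self, item_type, query=None):
            # Serve unfiltered requests from memory after the first fetch
            if item_type in self._cache and query is None:
                return self._cache[item_type]
            items = self._fetch_all(item_type, query)
            # Only cache unfiltered results for cachable types
            if item_type in self.CACHABLE_TYPES and query is None:
                self._cache[item_type] = items
            return items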
25 changes: 17 additions & 8 deletions paperlessngx_postprocessor/postprocessor.py
@@ -147,7 +147,7 @@ def _normalize_created_dates(self, new_metadata, old_metadata):
         result["created_day"] = self._normalize_day(new_metadata["created_day"], old_metadata["created_day"])
 
         original_created_date = dateutil.parser.isoparse(old_metadata["created"])
-        new_created_date = datetime(int(result["created_year"]), int(result["created_month"]), int(result["created_day"]), 12, tzinfo=original_created_date.tzinfo)
+        new_created_date = datetime(int(result["created_year"]), int(result["created_month"]), int(result["created_day"]), original_created_date.hour, tzinfo=original_created_date.tzinfo)
         result["created"] = new_created_date.isoformat()
         result["created_date"] = new_created_date.strftime("%F") # %F means YYYY-MM-DD
         result["created_date_object"] = date(int(result["created_year"]), int(result["created_month"]), int(result["created_day"]))
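A quick worked example of what changed in this hunk, using a made-up timestamp: the old code pinned the rebuilt created date to noon, while the new code carries the stored hour through (minutes and seconds still reset to zero, as in the diff):

    from datetime import datetime
    import dateutil.parser

    # Hypothetical stored value: a document created at 08:15 UTC
    original = dateutil.parser.isoparse("2023-07-08T08:15:00+00:00")

    old_style = datetime(2023, 7, 8, 12, tzinfo=original.tzinfo)             # hour pinned to 12
    new_style = datetime(2023, 7, 8, original.hour, tzinfo=original.tzinfo)  # hour preserved

    assert old_style.isoformat() == "2023-07-08T12:00:00+00:00"
    assert new_style.isoformat() == "2023-07-08T08:00:00+00:00"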
@@ -161,9 +161,11 @@ def validate(self, metadata):
 
         # Try to apply the validation rule
         if self._validation_rule is not None:
-            self._logger.debug(f"Validating for rule {self.name}")
+            self._logger.debug(f"Validating for rule {self.name} using metadata={metadata}")
             template = self._env.from_string(self._validation_rule)
-            valid = (template.render(**metadata).strip() != "False")
+            template_result = template.render(**metadata).strip()
+            self._logger.debug(f"Validation template rendered to '{template_result}'")
+            valid = (template_result != "False")
             if not valid:
                 self._logger.warning(f"Failed validation rule '{self._validation_rule}'")
             else:
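For context on what this hunk instruments: the validation rule is a Jinja template rendered against the document's metadata, and a document fails only when the rendered, stripped result is exactly the string "False". A self-contained sketch with hypothetical metadata and rule:

    from jinja2 import Environment

    metadata = {"correspondent": "", "created_year": "2023"}  # hypothetical metadata
    rule = "{{ 'True' if correspondent else 'False' }}"       # hypothetical validation rule

    template = Environment().from_string(rule)
    template_result = template.render(**metadata).strip()
    valid = (template_result != "False")  # anything but the literal "False" counts as valid
    print(valid)  # prints False here, since correspondent is empty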
@@ -179,7 +181,8 @@ def get_new_metadata(self, metadata, content):
                                    "added",
                                    "added_year",
                                    "added_month",
-                                   "added_day"]
+                                   "added_day",
+                                   "document_id"]
         read_only_metadata = {key: metadata[key] for key in read_only_metadata_keys if key in metadata}
         writable_metadata_keys = list(set(metadata.keys()) - set(read_only_metadata_keys))
         writable_metadata = {key: metadata[key] for key in writable_metadata_keys if key in metadata}
@@ -194,8 +197,8 @@
             writable_metadata = self._normalize_created_dates(writable_metadata, metadata)
             self._logger.debug(f"Regex results are {writable_metadata}")
         else:
-            self._logger.warning(f"Regex '{self._metadata_regex}' for '{self.name}' didn't match")
+            self._logger.warning(f"Regex '{self._metadata_regex}' for '{self.name}' didn't match for document_id={metadata['document_id']}")
 
         # Cycle through the postprocessing rules
         if self._metadata_postprocessing is not None:
             for variable_name in self._metadata_postprocessing.keys():
@@ -275,6 +278,7 @@ def _validate(self, metadata_in_filename_format):
 
     def postprocess(self, documents):
         backup_documents = []
+        num_invalid = 0
         for document in documents:
             metadata_in_filename_format = self._api.get_metadata_in_filename_format(document)
             self._logger.debug(f"metadata_in_filename_format={metadata_in_filename_format}")
@@ -303,10 +307,12 @@
                 self._logger.info(f"No changes for document_id={document['id']}")
 
             if (not self._skip_validation) and (self._invalid_tag_id is not None):
-                metadata_in_filename_format = self._api.get_metadata_in_filename_format(document)
+                # Note that we have to refetch the document here to get the changes we just applied from postprocessing
+                metadata_in_filename_format = self._api.get_metadata_in_filename_format(self._api.get_document_by_id(document['id']))
                 metadata = self._api.get_metadata_from_filename_format(metadata_in_filename_format)
                 valid = self._validate(metadata_in_filename_format)
                 if not valid:
+                    num_invalid += 1
                     metadata["tags"].append(self._invalid_tag_id)
                     self._logger.warning(f"document_id={document['id']} is invalid, adding tag {self._invalid_tag_id}")
                     if not self._dry_run:
@@ -317,7 +323,10 @@
                 else:
                     self._logger.info(f"document_id={document['id']} is valid")
             else:
-                self._logger.info("Validation was skipped")
+                self._logger.info(f"Validation was skipped since invalid_tag_id={self._invalid_tag_id} and skip_validation={self._skip_validation}")
+
+        if num_invalid > 0:
+            self._logger.warning(f"Found {num_invalid}/{len(documents)} invalid documents")
 
         return backup_documents

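The refetch added in the postprocess hunk deserves a note: `document` was fetched before postprocessing ran, so validating against it would check stale metadata and could wrongly tag documents whose values were just fixed. A hypothetical helper distilling the flow, using only API methods that appear in this diff:

    def validate_with_fresh_metadata(api, document, invalid_tag_id, validate):
        """Re-read the document so validation sees the metadata postprocessing
        just wrote, rather than the stale pre-postprocessing copy."""
        fresh = api.get_document_by_id(document["id"])
        metadata_in_filename_format = api.get_metadata_in_filename_format(fresh)
        metadata = api.get_metadata_from_filename_format(metadata_in_filename_format)
        if not validate(metadata_in_filename_format):
            metadata["tags"].append(invalid_tag_id)  # mark the document for review
        return metadata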
2 changes: 1 addition & 1 deletion post_consume_cid_fixer.py
@@ -12,7 +12,7 @@
 if __name__ == "__main__":
     document_id = os.environ["DOCUMENT_ID"]
 
-    config = Config()
+    config = Config(Config.general_options())
     logging.basicConfig(format="[%(asctime)s] [%(levelname)s] [%(module)s] %(message)s", level=config["verbose"])
 
     api = PaperlessAPI(config["paperless_api_url"],
5 changes: 3 additions & 2 deletions post_consume_script.py
@@ -15,14 +15,15 @@
 
 if document_id is not None:
     subprocess.run((str(Path(directory)/"paperlessngx_postprocessor.py"),
-                    "document_id",
+                    "process",
+                    "--document-id",
                     document_id))
 
 post_consume_script = os.environ.get("PNGX_POSTPROCESSOR_POST_CONSUME_SCRIPT")
 if post_consume_script is not None:
     logging.basicConfig(format="[%(asctime)s] [%(levelname)s] [%(module)s] %(message)s")
 
-    config = Config()
+    config = Config(Config.general_options())
 
     logging.getLogger().setLevel(config["verbose"])
 
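This is the new CLI format the commit message refers to: a `process` subcommand with an explicit `--document-id` flag, replacing the old bare `document_id` positional form. A hedged sketch of an equivalent standalone invocation (the install path and document id are made-up examples):

    import subprocess
    from pathlib import Path

    directory = Path("/opt/paperlessngx_postprocessor")  # hypothetical install location

    subprocess.run((str(directory / "paperlessngx_postprocessor.py"),
                    "process",         # new: explicit subcommand
                    "--document-id",   # new: named flag instead of a positional value
                    "1234"))           # hypothetical document id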
2 changes: 1 addition & 1 deletion post_consume_title_change_detector.py
@@ -19,7 +19,7 @@
 
 new_filename = Path(os.environ["DOCUMENT_SOURCE_PATH"]).name
 if old_filename != new_filename:
-    config = Config()
+    config = Config(Config.general_options())
     api = PaperlessAPI(config["paperless_api_url"],
                        auth_token = config["auth_token"],
                        paperless_src_dir = config["paperless_src_dir"])
