Skip to content

Commit

Permalink
Merge pull request #559 from robertatakenaka/resolve_problemas_de_html2xml_body_and_back_ausentes_e_caracteres_especiais_em_referencias
Browse files Browse the repository at this point in the history

Resolve problemas de html2xml body and back ausentes e caracteres especiais em referencias
  • Loading branch information
robertatakenaka authored Oct 29, 2024
2 parents 7d59ce6 + 3cd76ac commit d4339a1
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 23 deletions.
1 change: 0 additions & 1 deletion article/wagtail_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ class ArticleModelAdmin(ModelAdmin):
"pid_v3",
"status",
"display_sections",
"order",
"fpage",
"position",
"first_publication_date",
Expand Down
85 changes: 68 additions & 17 deletions htmlxml/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import os
import sys

from django.core.files.base import ContentFile
from django.db import models
Expand Down Expand Up @@ -503,6 +504,17 @@ class Meta:
models.Index(fields=["migrated_article"]),
]

@property
def data(self):
    """
    Return this record's conversion-tracking attributes as a plain dict.

    Each key is the name of the attribute it was read from, so the dict can
    be merged directly into an operation's ``detail`` payload.
    """
    field_names = (
        "html2xml_status",
        "n_paragraphs",
        "n_references",
        "record_types",
        "html_translation_langs",
        "pdf_langs",
    )
    # Attributes are read in declaration order, which is also the key order.
    return {name: getattr(self, name) for name in field_names}

@property
def directory_path(self):
    """
    Return the relative storage directory for this record's html2xml files.

    The path is built from the collection acronym and the path of the
    related migrated article.
    """
    migrated = self.migrated_article
    acron = migrated.collection.acron
    return f"classic_website/{acron}/html2xml/{migrated.path}"
Expand Down Expand Up @@ -566,6 +578,7 @@ def html_to_xml(
body_and_back_xml,
):
try:
op = article_proc.start(user, "html_to_xml")
self.html2xml_status = tracker_choices.PROGRESS_STATUS_DOING
self.html_translation_langs = "-".join(
sorted(article_proc.translations.keys())
Expand All @@ -580,6 +593,7 @@ def html_to_xml(
)
self.save()

detail = {}
document = Document(article_proc.migrated_data.data)
document._translated_html_by_lang = article_proc.translations

Expand All @@ -588,18 +602,37 @@ def html_to_xml(
)
xml_content = self._generate_xml_from_html(user, article_proc, document)

if xml_content and body_and_back:
detail = {"xml_content": bool(xml_content), "body_and_back": bool(body_and_back)}
completed = bool(xml_content and body_and_back)
if completed:
self.html2xml_status = tracker_choices.PROGRESS_STATUS_DONE
elif xml_content:
self.html2xml_status = tracker_choices.PROGRESS_STATUS_PENDING
else:
self.html2xml_status = tracker_choices.PROGRESS_STATUS_BLOCKED
self.html2xml_status = tracker_choices.PROGRESS_STATUS_PENDING
self.save()

op.finish(
user,
completed=completed,
exception=None,
message_type=None,
message=None,
exc_traceback=None,
detail=detail,
)
except Exception as e:
exc_type, exc_value, exc_traceback = sys.exc_info()

self.html2xml_status = tracker_choices.PROGRESS_STATUS_BLOCKED
self.save()
raise e
self.generate_report(user, article_proc)
op.finish(
user,
completed=False,
exception=e,
message_type=None,
message=None,
exc_traceback=exc_traceback,
detail=detail,
)
return xml_content

@property
Expand All @@ -617,8 +650,9 @@ def latest_bb_file(self):
return ""

def generate_report(self, user, article_proc):
op = article_proc.start(user, "generate html xml report")
op = article_proc.start(user, "html_to_xml: generate report")
try:
detail = {}
html = _fromstring(self.first_bb_file)

for xml_with_pre in XMLWithPre.create(path=self.file.path):
Expand All @@ -641,28 +675,36 @@ def generate_report(self, user, article_proc):
},
)
except Exception as e:
op.finish(user, completed=False, detail={"error": str(e)})
exc_type, exc_value, exc_traceback = sys.exc_info()
op.finish(
user,
completed=False,
exception=e,
message_type=None,
message=None,
exc_traceback=exc_traceback,
detail=detail,
)

def _generate_xml_body_and_back(self, user, article_proc, document):
"""
Generate XML body and back from html_translation_langs and p records
"""
done = False
operation = article_proc.start(user, "generate xml body and back")
operation = article_proc.start(user, "html_to_xml: generate xml body + back")

languages = document._translated_html_by_lang
detail = {}
detail.update(languages)

try:
document.generate_body_and_back_from_html(languages)
done = True
# guarda cada versão de body/back
except GenerateBodyAndBackFromHTMLError as e:
# cria xml_body_and_back padrão
document.xml_body_and_back = ["<article/>"]
detail = {"warning": str(e)}
document.xml_body_and_back = ["<article><body/><back/></article>"]
done = False

# guarda cada versão de body/back
if document.xml_body_and_back:
for i, xml_body_and_back in enumerate(document.xml_body_and_back, start=1):
BodyAndBackFile.create_or_update(
Expand All @@ -677,18 +719,27 @@ def _generate_xml_body_and_back(self, user, article_proc, document):
return done

def _generate_xml_from_html(self, user, article_proc, document):
operation = article_proc.start(user, "_generate_xml_from_html")
operation = article_proc.start(user, "html_to_xml: merge front + body + back")
xml_content = None
detail = {}
try:
xml_content = document.generate_full_xml(None).decode("utf-8")
xml_file = article_proc.pkg_name + ".xml"
self.save_file(xml_file, xml_content)
detail["xml"] = xml_file
operation.finish(user, bool(xml_content), detail=detail)
return xml_content
except Exception as e:
detail = {"error": str(e)}
operation.finish(user, bool(xml_content), detail=detail)
return xml_content
exc_type, exc_value, exc_traceback = sys.exc_info()
operation.finish(
user,
completed=False,
exception=e,
message_type=None,
message=None,
exc_traceback=exc_traceback,
detail=detail,
)

def save_report(self, content):
# content = json.dumps(data)
Expand Down
9 changes: 7 additions & 2 deletions proc/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1423,6 +1423,9 @@ def get_xml(self, user, body_and_back_xml):
self.migrated_data.file_type = self.migrated_data.document.file_type
self.migrated_data.save()

detail = {}
detail["file_type"] = self.migrated_data.file_type

if self.migrated_data.file_type == "html":
migrated_data = self.migrated_data
classic_ws_doc = migrated_data.document
Expand All @@ -1433,17 +1436,19 @@ def get_xml(self, user, body_and_back_xml):
record_types="|".join(classic_ws_doc.record_types or []),
)
htmlxml.html_to_xml(user, self, body_and_back_xml)
htmlxml.generate_report(user, self)
detail.update(htmlxml.data)

xml = get_migrated_xml_with_pre(self)

if xml:
self.xml_status = tracker_choices.PROGRESS_STATUS_DONE
detail.update(xml.data)
else:
self.xml_status = tracker_choices.PROGRESS_STATUS_REPROC
self.save()

completed = self.xml_status == tracker_choices.PROGRESS_STATUS_DONE
operation.finish(user, completed=completed, detail=xml and xml.data)
operation.finish(user, completed=completed, detail=detail)
return completed
except Exception as e:
exc_type, exc_value, exc_traceback = sys.exc_info()
Expand Down
1 change: 1 addition & 0 deletions proc/wagtail_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ class ArticleProcModelAdmin(ModelAdmin):
edit_view_class = ProcEditView
list_per_page = 10
list_display = (
"__str__",
"pkg_name",
"issue_proc",
"xml_status",
Expand Down
2 changes: 1 addition & 1 deletion publication/api/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def publish_article(article_proc, api_data, journal_pid=None):
raise ValueError(
"publication.api.document.publish_article requires journal_pid")

order = article_proc.article.order
order = article_proc.article.position
pub_date = article_proc.article.first_publication_date or datetime.utcnow()

build_article(builder, article_proc.article, journal_pid, order, pub_date)
Expand Down
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ mongoengine==0.28.2
aiohttp==3.9.1
# DSM Migration
# ------------------------------------------------------------------------------
-e git+https://github.com/scieloorg/[email protected].4#egg=scielo_classic_website
-e git+https://github.com/scieloorg/[email protected].5#egg=scielo_classic_website
python-dateutil==2.8.2
tornado>=6.3.2 # not directly required, pinned by Snyk to avoid a vulnerability

Expand Down
2 changes: 1 addition & 1 deletion tracker/choices.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,4 @@


def allowed_to_run(status, force_update):
    """
    Tell whether a task in the given progress status may be executed.

    Parameters
    ----------
    status : progress status value to check
    force_update : truthy to additionally allow statuses listed in
        PROGRESS_STATUS_FORCE_UPDATE

    Returns
    -------
    bool
        True when ``force_update`` is truthy and ``status`` is in
        PROGRESS_STATUS_FORCE_UPDATE, or when ``status`` is in
        PROGRESS_STATUS_REGULAR_TODO regardless of ``force_update``.
    """
    # Written as two branches so the `and`-before-`or` precedence of the
    # one-line form is explicit.
    if force_update and status in PROGRESS_STATUS_FORCE_UPDATE:
        return True
    return status in PROGRESS_STATUS_REGULAR_TODO

0 comments on commit d4339a1

Please sign in to comment.