Skip to content

Commit

Permalink
Merge 8c6f701 into d45f98c
Browse files Browse the repository at this point in the history
  • Loading branch information
medariox authored May 27, 2023
2 parents d45f98c + 8c6f701 commit 0f15b53
Show file tree
Hide file tree
Showing 141 changed files with 4,413 additions and 2,693 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## 1.0.16 (27-05-2023)

#### Improvements
- Raise warning when TVDB returns malformed data
- Update many JavaScript and Python dependencies

-----

## 1.0.15 (21-05-2023)

#### Fixes
Expand Down
59 changes: 43 additions & 16 deletions ext/bs4/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"""

__author__ = "Leonard Richardson ([email protected])"
__version__ = "4.11.2"
__version__ = "4.12.2"
__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
Expand All @@ -38,11 +38,13 @@
builder_registry,
ParserRejectedMarkup,
XMLParsedAsHTMLWarning,
HTMLParserTreeBuilder
)
from .dammit import UnicodeDammit
from .element import (
CData,
Comment,
CSS,
DEFAULT_OUTPUT_ENCODING,
Declaration,
Doctype,
Expand Down Expand Up @@ -116,7 +118,7 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
element_classes=None, **kwargs):
Expand Down Expand Up @@ -348,25 +350,49 @@ def deprecated_argument(old_name, new_name):
self.markup = None
self.builder.soup = None

def __copy__(self):
"""Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
copy = type(self)(
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
)
def _clone(self):
"""Create a new BeautifulSoup object with the same TreeBuilder,
but not associated with any markup.
# Although we encoded the tree to UTF-8, that may not have
# been the encoding of the original markup. Set the copy's
# .original_encoding to reflect the original object's
# .original_encoding.
copy.original_encoding = self.original_encoding
return copy
This is the first step of the deepcopy process.
"""
clone = type(self)("", None, self.builder)

# Keep track of the encoding of the original document,
# since we won't be parsing it again.
clone.original_encoding = self.original_encoding
return clone

def __getstate__(self):
# Frequently a tree builder can't be pickled.
d = dict(self.__dict__)
if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
d['builder'] = None
d['builder'] = type(self.builder)
# Store the contents as a Unicode string.
d['contents'] = []
d['markup'] = self.decode()

# If _most_recent_element is present, it's a Tag object left
# over from initial parse. It might not be picklable and we
# don't need it.
if '_most_recent_element' in d:
del d['_most_recent_element']
return d

def __setstate__(self, state):
# If necessary, restore the TreeBuilder by looking it up.
self.__dict__ = state
if isinstance(self.builder, type):
self.builder = self.builder()
elif not self.builder:
# We don't know which builder was used to build this
# parse tree, so use a default we know is always available.
self.builder = HTMLParserTreeBuilder()
self.builder.soup = self
self.reset()
self._feed()
return state


@classmethod
def _decode_markup(cls, markup):
Expand Down Expand Up @@ -468,6 +494,7 @@ def reset(self):
self.open_tag_counter = Counter()
self.preserve_whitespace_tag_stack = []
self.string_container_stack = []
self._most_recent_element = None
self.pushTag(self)

def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
Expand Down Expand Up @@ -749,7 +776,7 @@ def handle_data(self, data):

def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
formatter="minimal", iterator=None):
"""Returns a string or Unicode representation of the parse tree
as an HTML or XML document.
Expand All @@ -776,7 +803,7 @@ def decode(self, pretty_print=False,
else:
indent_level = 0
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
indent_level, eventual_encoding, formatter, iterator)

# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
_s = BeautifulSoup
Expand Down
25 changes: 24 additions & 1 deletion ext/bs4/builder/_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from bs4.builder import (
DetectsXMLParsedAsHTML,
ParserRejectedMarkup,
HTML,
HTMLTreeBuilder,
STRICT,
Expand Down Expand Up @@ -70,6 +71,22 @@ def __init__(self, *args, **kwargs):

self._initialize_xml_detector()

def error(self, message):
# NOTE: This method is required so long as Python 3.9 is
# supported. The corresponding code is removed from HTMLParser
# in 3.5, but not removed from ParserBase until 3.10.
# https://github.com/python/cpython/issues/76025
#
# The original implementation turned the error into a warning,
# but in every case I discovered, this made HTMLParser
# immediately crash with an error message that was less
# helpful than the warning. The new implementation makes it
# more clear that html.parser just can't parse this
# markup. The 3.10 implementation does the same, though it
# raises AssertionError rather than calling a method. (We
# catch this error and wrap it in a ParserRejectedMarkup.)
raise ParserRejectedMarkup(message)

def handle_startendtag(self, name, attrs):
"""Handle an incoming empty-element tag.
Expand Down Expand Up @@ -359,6 +376,12 @@ def feed(self, markup):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
parser.feed(markup)
try:
parser.feed(markup)
except AssertionError as e:
# html.parser raises AssertionError in rare cases to
# indicate a fatal problem with the markup, especially
# when there's an error in the doctype declaration.
raise ParserRejectedMarkup(e)
parser.close()
parser.already_closed_empty_element = []
Loading

0 comments on commit 0f15b53

Please sign in to comment.