Skip to content

Commit

Permalink
[#230,#241][Xooxle/Index] Replace Whitespace with one Space Character.
Browse files Browse the repository at this point in the history
Newlines are problematic, because they get converted downstream (by the
Xooxle engine) to `<br>` tags.
  • Loading branch information
pishoyg committed Oct 9, 2024
1 parent 90d16eb commit 644012b
Showing 1 changed file with 1 addition and 16 deletions.
17 changes: 1 addition & 16 deletions site/xooxle.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import enum
import os
import re
import typing

import bs4
Expand Down Expand Up @@ -46,8 +45,6 @@
"hr",
}

# Matches any run of two or more consecutive whitespace characters (`\s`
# covers spaces, tabs, newlines, etc.). A single whitespace character on its
# own does not match.
_MULTIPLE_SPACES = re.compile(r"\s{2,}")


class selector:
def select(self, _: pd.Series | bs4.Tag) -> bs4.Tag | None:
Expand Down Expand Up @@ -155,23 +152,11 @@ def _clean_text(text: str) -> str:


def _clean_html(tag: bs4.Tag) -> str:
    """Serialize `tag` to HTML with comments removed and whitespace collapsed.

    All comment nodes are detached from `tag`, so the tag is modified in
    place. The remaining markup is then serialized, every run of whitespace
    (including a lone newline or tab) is replaced with a single space, and
    leading and trailing whitespace is dropped.

    Lone newlines must not survive, because they get converted downstream
    (by the Xooxle engine) to `<br>` tags. This is why a plain
    split-and-join is used rather than only collapsing runs of two or more
    whitespace characters.

    Args:
        tag: The element to clean. Its comment nodes are extracted.

    Returns:
        The HTML of `tag` as a single line, with words separated by exactly
        one space.
    """
    # `find_all` returns a materialized result list, so extracting nodes
    # while looping over it is safe.
    for comment in tag.find_all(
        string=lambda node: isinstance(node, bs4.Comment),
    ):
        comment.extract()
    # `str.split()` with no arguments splits on any whitespace run and
    # discards empty leading/trailing pieces.
    return " ".join(str(tag).split())


class capture:
Expand Down

0 comments on commit 644012b

Please sign in to comment.