Skip to content

Commit

Permalink
Merge pull request #2777 from tirkarthi/fix-xml
Browse files: browse the repository at this point in the history
xml.etree.cElementTree has been deprecated since Python 3.3 and was removed in Python 3.9
  • Loading branch information
piskvorky authored Apr 24, 2020
2 parents 47357de + 585b0c0 commit 996801b
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 6 deletions.
8 changes: 6 additions & 2 deletions gensim/corpora/wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,12 @@
import re
import signal
from pickle import PicklingError
from xml.etree.cElementTree import \
iterparse # LXML isn't faster, so let's go with the built-in solution
# LXML isn't faster, so let's go with the built-in solution
try:
from xml.etree.cElementTree import iterparse
except ImportError:
from xml.etree.ElementTree import iterparse


from gensim import utils
# cannot import whole gensim.corpora, because that imports wikicorpus...
Expand Down
11 changes: 7 additions & 4 deletions gensim/scripts/segment_wiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,10 @@
import multiprocessing
import re
import sys
from xml.etree import cElementTree
try:
from xml.etree import cElementTree as ET
except ImportError:
from xml.etree import ElementTree as ET
from functools import partial

from gensim.corpora.wikicorpus import IGNORED_NAMESPACES, WikiCorpus, filter_wiki, find_interlinks, get_namespace, utils
Expand Down Expand Up @@ -183,7 +186,7 @@ def extract_page_xmls(f):
XML strings for page tags.
"""
elems = (elem for _, elem in cElementTree.iterparse(f, events=("end",)))
elems = (elem for _, elem in ET.iterparse(f, events=("end",)))

elem = next(elems)
namespace = get_namespace(elem.tag)
Expand All @@ -192,7 +195,7 @@ def extract_page_xmls(f):

for elem in elems:
if elem.tag == page_tag:
yield cElementTree.tostring(elem)
yield ET.tostring(elem)
# Prune the element tree, as per
# http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
# except that we don't need to prune backlinks from the parent
Expand Down Expand Up @@ -221,7 +224,7 @@ def segment(page_xml, include_interlinks=False):
(Optionally) [(interlink_article, interlink_text), ...]).
"""
elem = cElementTree.fromstring(page_xml)
elem = ET.fromstring(page_xml)
filter_namespaces = ('0',)
namespace = get_namespace(elem.tag)
ns_mapping = {"ns": namespace}
Expand Down

0 comments on commit 996801b

Please sign in to comment.