Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add article interlinks to the output of gensim.scripts.segment_wiki. Fix #1712 #1839

Merged
merged 23 commits into from
Jan 31, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
f90cd9c
promoting the markup gives up information needed to find the interlinks
steremma Jan 13, 2018
cdfb26a
Add interlinks to the output of `segment_wiki`
steremma Jan 13, 2018
acc5221
Fixed PEP 8
steremma Jan 13, 2018
0057c7b
Refactoring indentation and variable names
steremma Jan 15, 2018
107d7f7
Removed debugging code from script
steremma Jan 15, 2018
4adcf86
Fixed a bug where interlinks with a description or multiple names whe…
steremma Jan 15, 2018
9bf6b87
Now stripping whitespace off section titles
steremma Jan 15, 2018
931e138
Unit test `gensim.scripts.segment_wiki`
steremma Jan 15, 2018
cd37315
Fix Python 3.5 compatibility
steremma Jan 15, 2018
c681a60
Section text now completely clean from wiki markup
steremma Jan 16, 2018
ead5386
Added extra logging info to troubleshoot weird Travis behavior
steremma Jan 16, 2018
193861c
Fix PEP 8
steremma Jan 16, 2018
e170c06
pin workers for segment_and_write_all_articles
menshikh-iv Jan 16, 2018
b68507b
Merge branch 'interlinks' of https://github.com/steremma/gensim into …
steremma Jan 16, 2018
0884f6d
Get rid of debugging stuff
steremma Jan 16, 2018
58f63ca
Get rid of global logger
steremma Jan 16, 2018
7682f30
Interlinks are now mapping from the linked article's title to the act…
steremma Jan 20, 2018
3b13d3b
Moved regex outside function
steremma Jan 20, 2018
e038f52
Interlink extraction is now optional and controlled with the `-i` com…
steremma Jan 25, 2018
68ca8b1
Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim…
steremma Jan 25, 2018
94c2b3d
PEP 8 long lines
steremma Jan 25, 2018
3c838a6
made scripts tests aware of the optional interlinks argument
steremma Jan 25, 2018
7f9ed71
Updated script help output for interlinks
steremma Jan 30, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions gensim/corpora/wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,6 @@ def remove_markup(text):
if old == text or iters > 2:
break

# the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists'
# TODO is this really desirable?
text = text.replace('[', '').replace(']', '') # promote all remaining markup to plain text
return text


Expand Down
57 changes: 42 additions & 15 deletions gensim/scripts/segment_wiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,27 +70,28 @@ def segment_all_articles(file_path, min_article_character=200, workers=None):

Yields
------
(str, list of (str, str))
Structure contains (title, [(section_heading, section_content), ...]).
(str, list of (str, str), list of str)
Structure contains (title, [(section_heading, section_content), ...], [interlink, ...]).

"""
with smart_open(file_path, 'rb') as xml_fileobj:
wiki_sections_corpus = _WikiSectionsCorpus(
xml_fileobj, min_article_character=min_article_character, processes=workers)
wiki_sections_corpus.metadata = True
wiki_sections_text = wiki_sections_corpus.get_texts_with_sections()
for article_title, article_sections in wiki_sections_text:
yield article_title, article_sections
for article_title, article_sections, article_interlinks in wiki_sections_text:
yield article_title, article_sections, article_interlinks


def segment_and_write_all_articles(file_path, output_file, min_article_character=200, workers=None):
"""Write article title and sections to `output_file` (or stdout, if output_file is None).

The output format is one article per line, in json-line format with 3 fields::
The output format is one article per line, in json-line format with 4 fields::

'title' - title of article,
'section_titles' - list of titles of sections,
'section_texts' - list of content from sections.
'section_texts' - list of content from sections,
'section_interlinks' - list of interlinks in the article.

Parameters
----------
Expand All @@ -115,8 +116,13 @@ def segment_and_write_all_articles(file_path, output_file, min_article_character

try:
article_stream = segment_all_articles(file_path, min_article_character, workers=workers)
for idx, (article_title, article_sections) in enumerate(article_stream):
output_data = {"title": article_title, "section_titles": [], "section_texts": []}
for idx, (article_title, article_sections, article_interlinks) in enumerate(article_stream):
output_data = {"title": article_title,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please use hanging indents (instead of vertical)

"section_titles": [],
"section_texts": [],
"section_interlinks": article_interlinks
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you don't split interlinks by sections, this should name as "interlinks" instead of "section_interlinks".

}

for section_heading, section_content in article_sections:
output_data["section_titles"].append(section_heading)
output_data["section_texts"].append(section_content)
Expand Down Expand Up @@ -171,9 +177,10 @@ def segment(page_xml):
Content from page tag.

Returns

-------
(str, list of (str, str))
Structure contains (title, [(section_heading, section_content)]).
(str, list of (str, str), list of str)
Structure contains (title, [(section_heading, section_content), ...], [interlink, ...]).

"""
elem = cElementTree.fromstring(page_xml)
Expand All @@ -186,6 +193,7 @@ def segment(page_xml):
lead_section_heading = "Introduction"
top_level_heading_regex = r"\n==[^=].*[^=]==\n"
top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"
interlink_regex_capture = r"\[\[(.*?)\]\]"

title = elem.find(title_path).text
text = elem.find(text_path).text
Expand All @@ -203,7 +211,14 @@ def segment(page_xml):

section_contents = [filter_wiki(section_content) for section_content in section_contents]
sections = list(zip(section_headings, section_contents))
return title, sections

interlinks = []
for filtered_content in section_contents:
section_interlinks = re.findall(interlink_regex_capture, filtered_content)
legit_interlinks = [i for i in section_interlinks if '[' not in i and ']' not in i]
interlinks.extend(legit_interlinks)

return title, sections, interlinks


class _WikiSectionsCorpus(WikiCorpus):
Expand Down Expand Up @@ -256,8 +271,8 @@ def get_texts_with_sections(self):

Yields
------
(str, list of (str, str))
Structure contains (title, [(section_heading, section_content), ...]).
(str, list of (str, str), list of str)
Structure contains (title, [(section_heading, section_content), ...], [interlink, ...]).

"""
skipped_namespace, skipped_length, skipped_redirect = 0, 0, 0
Expand All @@ -267,7 +282,7 @@ def get_texts_with_sections(self):
# process the corpus in smaller chunks of docs, because multiprocessing.Pool
# is dumb and would load the entire input into RAM at once...
for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
for article_title, sections in pool.imap(segment, group): # chunksize=10):
for article_title, sections, interlinks in pool.imap(segment, group): # chunksize=10):
# article redirects are pruned here
if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): # filter non-articles
skipped_namespace += 1
Expand All @@ -282,7 +297,7 @@ def get_texts_with_sections(self):

total_articles += 1
total_sections += len(sections)
yield (article_title, sections)
yield (article_title, sections, interlinks)
logger.info(
"finished processing %i articles with %i sections (skipped %i redirects, %i stubs, %i ignored namespaces)",
total_articles, total_sections, skipped_redirect, skipped_length, skipped_namespace)
Expand Down Expand Up @@ -321,3 +336,15 @@ def get_texts_with_sections(self):
)

logger.info("finished running %s", sys.argv[0])

print("-----Now checking output--------\n\n\n")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why? This isn't needed here.

for line in smart_open(args.output):
# decode each JSON line into a Python dictionary object
article = json.loads(line)

# each article has a "title" and a list of "section_titles" and "section_texts".
print("Article title: %s" % article['title'])
print("Article interlinks: %s" % article['section_interlinks'])
for section_title, section_text in zip(article['section_titles'], article['section_texts']):
print("Section title: %s" % section_title)
print("Section text: %s" % section_text)