Skip to content

Commit

Permalink
misc: improvement from review
Browse files Browse the repository at this point in the history
  • Loading branch information
HLasse committed Dec 15, 2022
1 parent 153764b commit b75c219
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 52 deletions.
46 changes: 30 additions & 16 deletions textdescriptives/components/coherence.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,24 +57,38 @@ def coherence(self, doc: Doc) -> None:
},
)

def _first_order_coherence(self, doc: Doc) -> List[float]:
@staticmethod
def _first_order_coherence(doc: Doc) -> List[float]:
"""Calculate first order coherence for a `Doc`, i.e. the semantic similarity
between consecutive sentences."""
return self._n_order_coherence(doc=doc, order=1)
return n_order_coherence(doc=doc, order=1)

def _second_order_coherence(self, doc: Doc) -> List[float]:
@staticmethod
def _second_order_coherence(doc: Doc) -> List[float]:
"""Calculate second order coherence for a `Doc`, i.e. the semantic similarity
between sentences that are two sentences apart."""
return self._n_order_coherence(doc, order=2)

def _n_order_coherence(self, doc: Doc, order: int):
"""Calculate coherence for a `Doc` for a given order."""
sents = list(doc.sents)
if len(sents) < order + 1:
return np.nan
similarities: List[float] = []
for i, sent in enumerate(sents):
if i == len(sents) - order:
break
similarities.append(sent.similarity(sents[i + order]))
return similarities
return n_order_coherence(doc, order=2)


def n_order_coherence(doc: Doc, order: int) -> List[float]:
    """Calculate coherence for a `Doc` for a given order.

    Args:
        doc: A `Doc` object whose `sents` can be iterated.
        order: The distance between the sentences being compared. For example,
            order=1 compares consecutive sentences, and order=2 compares
            sentences that are two sentences apart. Must be at least 1.

    Returns:
        One similarity per sentence pair `(i, i + order)`, in document order.
        If the document has fewer than `order + 1` sentences there is no pair
        to compare, and `[np.nan]` is returned instead.

    Raises:
        ValueError: If `order` is smaller than 1.
    """
    if order < 1:
        # A zero order would compare each sentence with itself and a negative
        # one would wrap around through negative indexing, so reject both.
        raise ValueError(f"`order` must be a positive integer, got {order}.")
    sents = list(doc.sents)
    if len(sents) <= order:
        return [np.nan]
    # Pair every sentence with the one `order` positions later.
    return [
        sent.similarity(later) for sent, later in zip(sents, sents[order:])
    ]
48 changes: 12 additions & 36 deletions textdescriptives/tests/test_coherence.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,35 +6,18 @@


@pytest.fixture(scope="function")
def nlp_small():
def nlp():
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives.coherence")
return nlp


@pytest.fixture(scope="function")
def nlp_large():
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("textdescriptives.coherence")
return nlp


def test_coherence_integration(nlp_small):
assert "textdescriptives.coherence" == nlp_small.pipe_names[-1]


def test_coherence_small_model(nlp_small):
doc = nlp_small(
"This is a short and simple sentence. Here is yet another one. We need quite a few before these coherences metrics make sense. Rambling, on and on."
)
def test_coherence_integration(nlp):
    """The coherence component should be the last pipe in the pipeline."""
    last_pipe = nlp.pipe_names[-1]
    assert last_pipe == "textdescriptives.coherence"

assert doc._.coherence
assert doc._.first_order_coherence_values
assert doc._.second_order_coherence_values


def test_coherence_large_model(nlp_large):
doc = nlp_large(
def test_coherence(nlp):
doc = nlp(
"This is a short and simple sentence. Here is yet another one. We need quite a few before these coherences metrics make sense. Rambling, on and on."
)

Expand All @@ -43,25 +26,18 @@ def test_coherence_large_model(nlp_large):
assert doc._.second_order_coherence_values


def test_coherence_small_model_single_sentence(nlp_small):
doc = nlp_small("This is a short and simple sentence.")

assert np.isnan(doc._.first_order_coherence_values).all()
assert np.isnan(doc._.second_order_coherence_values).all()


def test_coherence_large_model_single_sentence(nlp_large):
doc = nlp_large("This is a short and simple sentence.")
def test_coherence_single_sentence(nlp):
    """Coherence values of a one-sentence document are expected to be all NaN."""
    doc = nlp("This is a short and simple sentence.")
    first_order = doc._.first_order_coherence_values
    second_order = doc._.second_order_coherence_values

    assert np.isnan(first_order).all()
    assert np.isnan(second_order).all()


def test_coherence_difference(nlp_large):
coherent_doc = nlp_large(
def test_coherence_difference(nlp):
coherent_doc = nlp(
"We will now talk about animals. Dogs are animals. Cats are animals. Birds are animals. Fish are animals."
)
incoherent_doc = nlp_large(
incoherent_doc = nlp(
"Let's talk about a bunch of things. Houses made of pancakes and dogs talking like humans. Look, the snow is falling."
)
assert (
Expand All @@ -74,8 +50,8 @@ def test_coherence_difference(nlp_large):
)


def test_coherence_multi_process(nlp_small):
docs = nlp_small.pipe(
def test_coherence_multi_process(nlp):
docs = nlp.pipe(
[
"This is a short and simple sentence. Here is yet another one. We need quite a few before these coherences metrics make sense. Rambling, on and on.",
"And another one. That's it. No more.",
Expand Down

0 comments on commit b75c219

Please sign in to comment.