diff --git a/.gitignore b/.gitignore index fa92dac5..75958cf3 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,5 @@ dmypy.json # Mac stuff .DStore + +.vscode \ No newline at end of file diff --git a/README.md b/README.md index 853ba8ef..a63706b5 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,6 @@ The table below shows the metrics included in TextDescriptives and their attribu | `Doc._.readability` | `readability` | Dict containing Flesch Reading Ease, Flesch-Kincaid Grade, SMOG, Gunning-Fog, Automated Readability Index, Coleman-Liau Index, LIX, and RIX readability metrics for the Doc. | | `Doc._.dependency_distance` | `dependency_distance` | Dict containing the mean and standard deviation of the dependency distance and proportion adjacent dependency relations in the Doc.| `Span._.token_length` | `descriptive_stats` | Dict containing mean, median, and std of token length in the span. | -| `Span._.syllables` | `descriptive_stats` | Dict containing mean, median, and std of number of syllables per token in the span. | | `Span._.counts` | `descriptive_stats` | Dict containing the number of tokens, number of unique tokens, proportion unique tokens, and number of characters in the span. | | `Span._.dependency_distance` | `dependency_distance` | Dict containing the mean dependency distance and proportion adjacent dependency relations in the Doc.| | `Token._.dependency_distance` | `dependency_distance` | Dict containing the dependency distance and whether the head word is adjacent for a Token.| diff --git a/setup.py b/setup.py index 93d9f4ee..68a39e52 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ author="Lasse Hansen", author_email="lasseh0310@gmail.com", url="https://github.com/HLasse/textdescriptives", - packages=["spacy-textdescriptives"], + packages=["textdescriptives"], install_requires=[ "spacy>=3.0.3", "numpy>=1.20.0", diff --git a/textdescriptives/__init__.py b/textdescriptives/__init__.py index 895d63b2..a0017310 100644 --- a/textdescriptives/__init__.py +++ b/textdescriptives/__init__.py @@ -1,2 +1,3 @@ from .load_components import TextDescriptives -from .extractor import extract_df +from .components import DescriptiveStatistics, Readability, DependencyDistance +from .extractor import extract_df, readability_cols, dependency_cols, descriptive_stats_cols diff --git a/textdescriptives/components/__init__.py b/textdescriptives/components/__init__.py index fcd6059d..8fa301b2 100644 --- a/textdescriptives/components/__init__.py +++ b/textdescriptives/components/__init__.py @@ -1,3 +1,3 @@ -from .readability import create_readability_component -from .dependency_distance import create_dependency_distance_component -from .descriptive_stats import create_descriptive_stats_component +from .readability import Readability +from .dependency_distance import DependencyDistance +from .descriptive_stats import DescriptiveStatistics diff --git a/textdescriptives/components/dependency_distance.py b/textdescriptives/components/dependency_distance.py index 62015a5b..8a5dc5a4 100644 --- a/textdescriptives/components/dependency_distance.py +++ b/textdescriptives/components/dependency_distance.py @@ -46,6 +46,13 @@ def span_dependency(self, span: Span): def doc_dependency(self, doc: Doc): """Doc-level dependency distance aggregated on sentence level""" + if len(doc) == 0: + return { + "dependency_distance_mean": np.nan, + "dependency_distance_std": np.nan, + "prop_adjacent_dependency_relation_mean": np.nan, + "prop_adjacent_dependency_relation_std": np.nan, + } dep_dists, adj_deps = zip( *[sent._.dependency_distance.values() for sent in doc.sents] ) diff --git a/textdescriptives/components/descriptive_stats.py b/textdescriptives/components/descriptive_stats.py index 585bdbcf..859ad5f0 100644 --- a/textdescriptives/components/descriptive_stats.py +++ b/textdescriptives/components/descriptive_stats.py @@ -9,6 +9,9 @@ @Language.factory("descriptive_stats") def create_descriptive_stats_component(nlp: Language, name: str): + sentencizers = set(["sentencizer", "parser"]) + if not sentencizers.intersection(set(nlp.pipe_names)): + nlp.add_pipe("sentencizer") # add a sentencizer if not one in pipe return DescriptiveStatistics(nlp) @@ -35,7 +38,7 @@ def __init__(self, nlp: Language): self.counts, ] for ext, fun in zip(extensions, ext_funs): - if ext not in ["_n_sentences", "sentence_length"]: + if ext not in ["_n_sentences", "sentence_length", "syllables"]: if not Span.has_extension(ext): Span.set_extension(ext, getter=fun) if not Doc.has_extension(ext): @@ -60,7 +63,7 @@ def token_length(self, doc: Union[Doc, Span]): "token_length_std": np.std(token_lengths), } - def sentence_length(self, doc: Union[Doc, Span]): + def sentence_length(self, doc: Doc): """Return dict with measures of sentence length""" # get length of filtered tokens per sentence tokenized_sentences = [ @@ -78,7 +81,7 @@ def sentence_length(self, doc: Union[Doc, Span]): "sentence_length_std": np.std(len_sentences), } - def syllables(self, doc: Union[Doc, Span]): + def syllables(self, doc: Doc): """Return dict with measures of syllables per token""" n_syllables = doc._._n_syllables return { @@ -95,12 +98,16 @@ def counts(self, doc: Union[Doc, Span], ignore_whitespace: bool = True): else: n_chars = len(doc.text) + if n_tokens == 0: + prop_unique_tokens = np.nan + else: + prop_unique_tokens = n_types / n_tokens out = { "n_tokens": n_tokens, "n_unique_tokens": n_types, - "percent_unique_tokens": n_types / n_tokens, + "proportion_unique_tokens": prop_unique_tokens, "n_characters": n_chars, } if type(doc) == Doc: out["n_sentences"] = doc._._n_sentences - return out + return out \ No newline at end of file diff --git a/textdescriptives/components/readability.py b/textdescriptives/components/readability.py index f6315855..9e85d435 100644 --- a/textdescriptives/components/readability.py +++ b/textdescriptives/components/readability.py @@ -1,7 +1,10 @@ """Calculation of various readability metrics""" +from textdescriptives.components.utils import n_sentences from spacy.tokens import Doc from spacy.language import Language +import numpy as np + from .descriptive_stats import create_descriptive_stats_component @@ -47,22 +50,22 @@ def _flesch_reading_ease(self, doc: Doc): Higher = easier to read Works best for English """ - score = ( - 206.835 - - (1.015 * doc._.sentence_length["sentence_length_mean"]) - - (84.6 * doc._.syllables["syllables_per_token_mean"]) - ) + avg_sentence_length = doc._.sentence_length["sentence_length_mean"] + avg_syl_per_word = doc._.syllables["syllables_per_token_mean"] + if avg_sentence_length == 0 or avg_syl_per_word == 0: + return np.nan + score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syl_per_word) return score def _flesch_kincaid_grade(self, doc: Doc): """ Score = grade required to read the text """ - score = ( - 0.39 * doc._.sentence_length["sentence_length_mean"] - + 11.8 * doc._.syllables["syllables_per_token_mean"] - - 15.59 - ) + avg_sentence_length = doc._.sentence_length["sentence_length_mean"] + avg_syl_per_word = doc._.syllables["syllables_per_token_mean"] + if avg_sentence_length == 0 or avg_syl_per_word == 0: + return np.nan + score = 0.39 * avg_sentence_length + 11.8 * avg_syl_per_word - 15.59 return score def _smog(self, doc: Doc, hard_words: int): @@ -70,25 +73,31 @@ def _smog(self, doc: Doc, hard_words: int): grade level = 1.043( sqrt(30 * (hard words /n sentences)) + 3.1291 Preferably need 30+ sentences. Will not work with less than 4 """ - if doc._._n_sentences >= 3: - smog = (1.043 * (30 * (hard_words / doc._._n_sentences)) ** 0.5) + 3.1291 + n_sentences = doc._._n_sentences + if n_sentences >= 3: + smog = (1.043 * (30 * (hard_words / n_sentences)) ** 0.5) + 3.1291 return smog else: - return 0.0 + return np.nan def _gunning_fog(self, doc, hard_words: int): """ Grade level = 0.4 * ((avg_sentence_length) + (percentage hard words)) hard words = 3+ syllables """ + n_tokens = doc._._n_tokens + if n_tokens == 0: + return np.nan avg_sent_len = doc._.sentence_length["sentence_length_mean"] - percent_hard_words = (hard_words / doc._._n_tokens) * 100 + percent_hard_words = (hard_words / n_tokens) * 100 return 0.4 * (avg_sent_len + percent_hard_words) def _automated_readability_index(self, doc: Doc): """ Score = grade required to read the text """ + if len(doc) == 0: + return np.nan score = ( 4.71 * doc._.token_length["token_length_mean"] + 0.5 * doc._.sentence_length["sentence_length_mean"] @@ -102,17 +111,26 @@ def _coleman_liau_index(self, doc: Doc): 0.296 * avg num of sents pr 100 words -15.8 Score = grade required to read the text """ + n_tokens = doc._._n_tokens + if n_tokens == 0: + return np.nan l = doc._.token_length["token_length_mean"] * 100 - s = (doc._._n_sentences / doc._.sentence_length["sentence_length_mean"]) * 100 + s = (doc._._n_sentences / n_tokens) * 100 return 0.0588 * l - 0.296 * s - 15.8 def _lix(self, doc: Doc, long_words: int): """ (n_words / n_sentences) + (n_words longer than 6 letters * 100) / n_words """ - percent_long_words = long_words / doc._._n_tokens * 100 + n_tokens = doc._._n_tokens + if n_tokens == 0: + return np.nan + percent_long_words = long_words / n_tokens * 100 return doc._.sentence_length["sentence_length_mean"] + percent_long_words def _rix(self, doc: Doc, long_words: int): """n_long_words / n_sentences""" - return long_words / doc._._n_sentences + n_sentences = doc._._n_sentences + if n_sentences == 0: + return np.nan + return long_words / n_sentences diff --git a/textdescriptives/components/utils.py b/textdescriptives/components/utils.py index 5228e64a..5dbe9730 100644 --- a/textdescriptives/components/utils.py +++ b/textdescriptives/components/utils.py @@ -24,10 +24,11 @@ def n_tokens(doc: Union[Doc, Span]): return len(doc._._filtered_tokens) -def n_syllables(doc: Union[Doc, Span]): +def n_syllables(doc: Doc): """ Return number of syllables per token """ + dic = Pyphen(lang=doc.lang_) def count_syl(token: Token): diff --git a/textdescriptives/extractor.py b/textdescriptives/extractor.py index b89cb3d3..6fe191f8 100644 --- a/textdescriptives/extractor.py +++ b/textdescriptives/extractor.py @@ -135,7 +135,7 @@ def extract_df( "syllables_per_token_std", "n_tokens", "n_unique_tokens", - "percent_unique_tokens", + "proportion_unique_tokens", "n_sentences", "n_characters", ] diff --git a/textdescriptives/load_components.py b/textdescriptives/load_components.py index cf016165..3536f34c 100644 --- a/textdescriptives/load_components.py +++ b/textdescriptives/load_components.py @@ -1,8 +1,8 @@ """Adds all components to a spaCy pipeline""" -from components import ( - create_readability_component, - create_dependency_distance_component, - create_descriptive_stats_component, +from .components import ( + Readability, + DependencyDistance, + DescriptiveStatistics, ) from spacy.language import Language diff --git a/textdescriptives/subsetters.py b/textdescriptives/tests/__init__.py similarity index 100% rename from textdescriptives/subsetters.py rename to textdescriptives/tests/__init__.py diff --git a/textdescriptives/tests/books.py b/textdescriptives/tests/books.py new file mode 100644 index 00000000..324188a8 --- /dev/null +++ b/textdescriptives/tests/books.py @@ -0,0 +1,511 @@ +"""These books and tests are borrowed from https://github.com/mholtzscher/spacy_readability""" + +oliver_twist = """Among other public buildings in a certain town, which for many reasons +it will be prudent to refrain from mentioning, and to which I will +assign no fictitious name, there is one anciently common to most towns, +great or small: to wit, a workhouse; and in this workhouse was born; on +a day and date which I need not trouble myself to repeat, inasmuch as +it can be of no possible consequence to the reader, in this stage of +the business at all events; the item of mortality whose name is +prefixed to the head of this chapter. + +For a long time after it was ushered into this world of sorrow and +trouble, by the parish surgeon, it remained a matter of considerable +doubt whether the child would survive to bear any name at all; in which +case it is somewhat more than probable that these memoirs would never +have appeared; or, if they had, that being comprised within a couple of +pages, they would have possessed the inestimable merit of being the +most concise and faithful specimen of biography, extant in the +literature of any age or country. + +Although I am not disposed to maintain that the being born in a +workhouse, is in itself the most fortunate and enviable circumstance +that can possibly befall a human being, I do mean to say that in this +particular instance, it was the best thing for Oliver Twist that could +by possibility have occurred. The fact is, that there was considerable +difficulty in inducing Oliver to take upon himself the office of +respiration,--a troublesome practice, but one which custom has rendered +necessary to our easy existence; and for some time he lay gasping on a +little flock mattress, rather unequally poised between this world and +the next: the balance being decidedly in favour of the latter. Now, +if, during this brief period, Oliver had been surrounded by careful +grandmothers, anxious aunts, experienced nurses, and doctors of +profound wisdom, he would most inevitably and indubitably have been +killed in no time. There being nobody by, however, but a pauper old +woman, who was rendered rather misty by an unwonted allowance of beer; +and a parish surgeon who did such matters by contract; Oliver and +Nature fought out the point between them. The result was, that, after +a few struggles, Oliver breathed, sneezed, and proceeded to advertise +to the inmates of the workhouse the fact of a new burden having been +imposed upon the parish, by setting up as loud a cry as could +reasonably have been expected from a male infant who had not been +possessed of that very useful appendage, a voice, for a much longer +space of time than three minutes and a quarter. + +As Oliver gave this first proof of the free and proper action of his +lungs, the patchwork coverlet which was carelessly flung over the iron +bedstead, rustled; the pale face of a young woman was raised feebly +from the pillow; and a faint voice imperfectly articulated the words, +'Let me see the child, and die.' + +The surgeon had been sitting with his face turned towards the fire: +giving the palms of his hands a warm and a rub alternately. As the +young woman spoke, he rose, and advancing to the bed's head, said, with +more kindness than might have been expected of him: + +'Oh, you must not talk about dying yet.' + +'Lor bless her dear heart, no!' interposed the nurse, hastily +depositing in her pocket a green glass bottle, the contents of which +she had been tasting in a corner with evident satisfaction. + +'Lor bless her dear heart, when she has lived as long as I have, sir, +and had thirteen children of her own, and all on 'em dead except two, +and them in the wurkus with me, she'll know better than to take on in +that way, bless her dear heart! Think what it is to be a mother, +there's a dear young lamb do.' + +Apparently this consolatory perspective of a mother's prospects failed +in producing its due effect. The patient shook her head, and stretched +out her hand towards the child. + +The surgeon deposited it in her arms. She imprinted her cold white +lips passionately on its forehead; passed her hands over her face; +gazed wildly round; shuddered; fell back--and died. They chafed her +breast, hands, and temples; but the blood had stopped forever. They +talked of hope and comfort. They had been strangers too long. + +'It's all over, Mrs. Thingummy!' said the surgeon at last. + +'Ah, poor dear, so it is!' said the nurse, picking up the cork of the +green bottle, which had fallen out on the pillow, as she stooped to +take up the child. 'Poor dear!' + +'You needn't mind sending up to me, if the child cries, nurse,' said +the surgeon, putting on his gloves with great deliberation. 'It's very +likely it _will_ be troublesome. Give it a little gruel if it is.' He +put on his hat, and, pausing by the bed-side on his way to the door, +added, 'She was a good-looking girl, too; where did she come from?' + +'She was brought here last night,' replied the old woman, 'by the +overseer's order. She was found lying in the street. She had walked +some distance, for her shoes were worn to pieces; but where she came +from, or where she was going to, nobody knows.' + +The surgeon leaned over the body, and raised the left hand. 'The old +story,' he said, shaking his head: 'no wedding-ring, I see. Ah! +Good-night!' + +The medical gentleman walked away to dinner; and the nurse, having once +more applied herself to the green bottle, sat down on a low chair +before the fire, and proceeded to dress the infant. + +What an excellent example of the power of dress, young Oliver Twist +was! Wrapped in the blanket which had hitherto formed his only +covering, he might have been the child of a nobleman or a beggar; it +would have been hard for the haughtiest stranger to have assigned him +his proper station in society. But now that he was enveloped in the +old calico robes which had grown yellow in the same service, he was +badged and ticketed, and fell into his place at once--a parish +child--the orphan of a workhouse--the humble, half-starved drudge--to +be cuffed and buffeted through the world--despised by all, and pitied +by none. + +Oliver cried lustily. If he could have known that he was an orphan, +left to the tender mercies of church-wardens and overseers, perhaps he +would have cried the louder.""" + +secret_garden = """ +When Mary Lennox was sent to Misselthwaite Manor to live with her uncle +everybody said she was the most disagreeable-looking child ever seen. +It was true, too. She had a little thin face and a little thin body, +thin light hair and a sour expression. Her hair was yellow, and her +face was yellow because she had been born in India and had always been +ill in one way or another. Her father had held a position under the +English Government and had always been busy and ill himself, and her +mother had been a great beauty who cared only to go to parties and +amuse herself with gay people. She had not wanted a little girl at +all, and when Mary was born she handed her over to the care of an Ayah, +who was made to understand that if she wished to please the Mem Sahib +she must keep the child out of sight as much as possible. So when she +was a sickly, fretful, ugly little baby she was kept out of the way, +and when she became a sickly, fretful, toddling thing she was kept out +of the way also. She never remembered seeing familiarly anything but +the dark faces of her Ayah and the other native servants, and as they +always obeyed her and gave her her own way in everything, because the +Mem Sahib would be angry if she was disturbed by her crying, by the +time she was six years old she was as tyrannical and selfish a little +pig as ever lived. The young English governess who came to teach her +to read and write disliked her so much that she gave up her place in +three months, and when other governesses came to try to fill it they +always went away in a shorter time than the first one. So if Mary had +not chosen to really want to know how to read books she would never +have learned her letters at all. + +One frightfully hot morning, when she was about nine years old, she +awakened feeling very cross, and she became crosser still when she saw +that the servant who stood by her bedside was not her Ayah. + +"Why did you come?" she said to the strange woman. "I will not let you +stay. Send my Ayah to me." + +The woman looked frightened, but she only stammered that the Ayah could +not come and when Mary threw herself into a passion and beat and kicked +her, she looked only more frightened and repeated that it was not +possible for the Ayah to come to Missie Sahib. + +There was something mysterious in the air that morning. Nothing was +done in its regular order and several of the native servants seemed +missing, while those whom Mary saw slunk or hurried about with ashy and +scared faces. But no one would tell her anything and her Ayah did not +come. She was actually left alone as the morning went on, and at last +she wandered out into the garden and began to play by herself under a +tree near the veranda. She pretended that she was making a flower-bed, +and she stuck big scarlet hibiscus blossoms into little heaps of earth, +all the time growing more and more angry and muttering to herself the +things she would say and the names she would call Saidie when she +returned. + +"Pig! Pig! Daughter of Pigs!" she said, because to call a native a pig +is the worst insult of all. + +She was grinding her teeth and saying this over and over again when she +heard her mother come out on the veranda with some one. She was with a +fair young man and they stood talking together in low strange voices. +Mary knew the fair young man who looked like a boy. She had heard that +he was a very young officer who had just come from England. The child +stared at him, but she stared most at her mother. She always did this +when she had a chance to see her, because the Mem Sahib--Mary used to +call her that oftener than anything else--was such a tall, slim, pretty +person and wore such lovely clothes. Her hair was like curly silk and +she had a delicate little nose which seemed to be disdaining things, +and she had large laughing eyes. All her clothes were thin and +floating, and Mary said they were "full of lace." They looked fuller of +lace than ever this morning, but her eyes were not laughing at all. +They were large and scared and lifted imploringly to the fair boy +officer's face. + +"Is it so very bad? Oh, is it?" Mary heard her say. + +"Awfully," the young man answered in a trembling voice. "Awfully, Mrs. +Lennox. You ought to have gone to the hills two weeks ago." + +The Mem Sahib wrung her hands. + +"Oh, I know I ought!" she cried. "I only stayed to go to that silly +dinner party. What a fool I was!" + +At that very moment such a loud sound of wailing broke out from the +servants' quarters that she clutched the young man's arm, and Mary +stood shivering from head to foot. The wailing grew wilder and wilder. +"What is it? What is it?" Mrs. Lennox gasped. + +"Some one has died," answered the boy officer. "You did not say it had +broken out among your servants." + +"I did not know!" the Mem Sahib cried. "Come with me! Come with me!" +and she turned and ran into the house. + +After that, appalling things happened, and the mysteriousness of the +morning was explained to Mary. The cholera had broken out in its most +fatal form and people were dying like flies. The Ayah had been taken +ill in the night, and it was because she had just died that the +servants had wailed in the huts. Before the next day three other +servants were dead and others had run away in terror. There was panic +on every side, and dying people in all the bungalows. + +During the confusion and bewilderment of the second day Mary hid +herself in the nursery and was forgotten by everyone. Nobody thought +of her, nobody wanted her, and strange things happened of which she +knew nothing. Mary alternately cried and slept through the hours. She +only knew that people were ill and that she heard mysterious and +frightening sounds. Once she crept into the dining-room and found it +empty, though a partly finished meal was on the table and chairs and +plates looked as if they had been hastily pushed back when the diners +rose suddenly for some reason. The child ate some fruit and biscuits, +and being thirsty she drank a glass of wine which stood nearly filled. +It was sweet, and she did not know how strong it was. Very soon it +made her intensely drowsy, and she went back to her nursery and shut +herself in again, frightened by cries she heard in the huts and by the +hurrying sound of feet. The wine made her so sleepy that she could +scarcely keep her eyes open and she lay down on her bed and knew +nothing more for a long time. + +Many things happened during the hours in which she slept so heavily, +but she was not disturbed by the wails and the sound of things being +carried in and out of the bungalow. + +When she awakened she lay and stared at the wall. The house was +perfectly still. She had never known it to be so silent before. She +heard neither voices nor footsteps, and wondered if everybody had got +well of the cholera and all the trouble was over. She wondered also +who would take care of her now her Ayah was dead. There would be a new +Ayah, and perhaps she would know some new stories. Mary had been +rather tired of the old ones. She did not cry because her nurse had +died. She was not an affectionate child and had never cared much for +any one. The noise and hurrying about and wailing over the cholera had +frightened her, and she had been angry because no one seemed to +remember that she was alive. Everyone was too panic-stricken to think +of a little girl no one was fond of. When people had the cholera it +seemed that they remembered nothing but themselves. But if everyone +had got well again, surely some one would remember and come to look for +her. + +But no one came, and as she lay waiting the house seemed to grow more +and more silent. She heard something rustling on the matting and when +she looked down she saw a little snake gliding along and watching her +with eyes like jewels. She was not frightened, because he was a +harmless little thing who would not hurt her and he seemed in a hurry +to get out of the room. He slipped under the door as she watched him. + +"How queer and quiet it is," she said. "It sounds as if there were no +one in the bungalow but me and the snake." + +Almost the next minute she heard footsteps in the compound, and then on +the veranda. They were men's footsteps, and the men entered the +bungalow and talked in low voices. No one went to meet or speak to +them and they seemed to open doors and look into rooms. "What +desolation!" she heard one voice say. "That pretty, pretty woman! I +suppose the child, too. I heard there was a child, though no one ever +saw her." + +Mary was standing in the middle of the nursery when they opened the +door a few minutes later. She looked an ugly, cross little thing and +was frowning because she was beginning to be hungry and feel +disgracefully neglected. The first man who came in was a large officer +she had once seen talking to her father. He looked tired and troubled, +but when he saw her he was so startled that he almost jumped back. + +"Barney!" he cried out. "There is a child here! A child alone! In a +place like this! Mercy on us, who is she!" + +"I am Mary Lennox," the little girl said, drawing herself up stiffly. +She thought the man was very rude to call her father's bungalow "A +place like this!" "I fell asleep when everyone had the cholera and I +have only just wakened up. Why does nobody come?" + +"It is the child no one ever saw!" exclaimed the man, turning to his +companions. "She has actually been forgotten!" + +"Why was I forgotten?" Mary said, stamping her foot. "Why does nobody +come?" + +The young man whose name was Barney looked at her very sadly. Mary +even thought she saw him wink his eyes as if to wink tears away. + +"Poor little kid!" he said. "There is nobody left to come." + +It was in that strange and sudden way that Mary found out that she had +neither father nor mother left; that they had died and been carried +away in the night, and that the few native servants who had not died +also had left the house as quickly as they could get out of it, none of +them even remembering that there was a Missie Sahib. That was why the +place was so quiet. It was true that there was no one in the bungalow +but herself and the little rustling snake.""" + +flatland = """ +I call our world Flatland, not because we call it so, but to make its +nature clearer to you, my happy readers, who are privileged to live in +Space. + +Imagine a vast sheet of paper on which straight Lines, Triangles, +Squares, Pentagons, Hexagons, and other figures, instead of remaining +fixed in their places, move freely about, on or in the surface, but +without the power of rising above or sinking below it, very much like +shadows--only hard and with luminous edges--and you will then have a +pretty correct notion of my country and countrymen. Alas, a few years +ago, I should have said "my universe": but now my mind has been opened +to higher views of things. + +In such a country, you will perceive at once that it is impossible that +there should be anything of what you call a "solid" kind; but I dare +say you will suppose that we could at least distinguish by sight the +Triangles, Squares, and other figures, moving about as I have described +them. On the contrary, we could see nothing of the kind, not at least +so as to distinguish one figure from another. Nothing was visible, nor +could be visible, to us, except Straight Lines; and the necessity of +this I will speedily demonstrate. + +Place a penny on the middle of one of your tables in Space; and leaning +over it, look down upon it. It will appear a circle. + +But now, drawing back to the edge of the table, gradually lower your +eye (thus bringing yourself more and more into the condition of the +inhabitants of Flatland), and you will find the penny becoming more and +more oval to your view, and at last when you have placed your eye +exactly on the edge of the table (so that you are, as it were, actually +a Flatlander) the penny will then have ceased to appear oval at all, +and will have become, so far as you can see, a straight line. + +The same thing would happen if you were to treat in the same way a +Triangle, or Square, or any other figure cut out of pasteboard. As +soon as you look at it with your eye on the edge on the table, you will +find that it ceases to appear to you a figure, and that it becomes in +appearance a straight line. Take for example an equilateral +Triangle--who represents with us a Tradesman of the respectable class. +Fig. 1 represents the Tradesman as you would see him while you were +bending over him from above; figs. 2 and 3 represent the Tradesman, as +you would see him if your eye were close to the level, or all but on +the level of the table; and if your eye were quite on the level of the +table (and that is how we see him in Flatland) you would see nothing +but a straight line. + +When I was in Spaceland I heard that your sailors have very similar +experiences while they traverse your seas and discern some distant +island or coast lying on the horizon. The far-off land may have bays, +forelands, angles in and out to any number and extent; yet at a +distance you see none of these (unless indeed your sun shines bright +upon them revealing the projections and retirements by means of light +and shade), nothing but a grey unbroken line upon the water. + +Well, that is just what we see when one of our triangular or other +acquaintances comes toward us in Flatland. As there is neither sun +with us, nor any light of such a kind as to make shadows, we have none +of the helps to the sight that you have in Spaceland. If our friend +comes closer to us we see his line becomes larger; if he leaves us it +becomes smaller: but still he looks like a straight line; be he a +Triangle, Square, Pentagon, Hexagon, Circle, what you will--a straight +Line he looks and nothing else. + +You may perhaps ask how under these disadvantageous circumstances we +are able to distinguish our friends from one another: but the answer to +this very natural question will be more fitly and easily given when I +come to describe the inhabitants of Flatland. For the present let me +defer this subject, and say a word or two about the climate and houses +in our country.""" + +grade_1 = """A train! A train! +A train! A train! +Could you, would you, +On a train? +Not on a train! Not in a tree! +Not in a car! Sam! Let me be! +I would not, could not, in a box. +I could not, would not, with a fox. +I will not eat them with a mouse. +I will not eat them in a house. +I will not eat them here or there. +I will not eat them anywhere. +I do not like green eggs and ham. +I do not like them, Sam-I-am.""" + +grade_2 = """Jonathan pushed back the big iron pot and stood up. +There were no bears. But up the path came his father, carrying his gun. And with +him were Jonathan's Uncle James and his Uncle Samuel, his Uncle John and his +Uncle Peter. Jonathan had never in all his life been so glad to see the uncles. +"Jonathan!'" said his father, "what a fright you have given us! Where have you +been all this time?" +"Coming over Hemlock Mountain," said Jonathan in a small voice. And he ran +right into his father's arms.""" + +grade_3 = """For months I had been telling myself that I would never put the Magic Finger +upon anyone again—not after what happened to my teacher, old Mrs. Winter. +Poor old Mrs. Winter. +One day we were in class, and she was teaching us spelling. "Stand up," she said +to me, "and spell kat." +"That's an easy one," I said. "K-a-t." +"You are a stupid little girl!" Mrs. Winter said. +"I am not a stupid little girl!" I cried. "I am a very nice little girl!" +"Go and stand in the corner," Mrs. Winter said. +Then I got cross, and I saw red, and I put the Magic Finger on Mrs. Winter good +and strong, and almost at once... +Guess what? +Whiskers began growing out of her face! They were long black whiskers, just +like the little ones you see on a kat, only much bigger. And how fast they grew! +Before we had time to think, they were out to her ears!""" + +grade_4 = """The wheelbarrow picked up speed, so quickly that it sort of kicked up like a +whipped horse. I thought the handle was going to rip right out of my fingers. +"Hang on," I said. +"If I can," said Soup. +We were running now, full speed, smack down Sutter's Hill and heading full tilt +toward the party. Ahead of us, the giant pumpkin bounced around inside the bin +of the barrow. I felt like we'd stolen the moon. +"We're out of control!" yelled Soup. +"Turn it. Do anything, anything!" +"Can't." +The front door of the Baptist Church grew bigger and bigger, rushing toward us +like a mad monster. My feet hardly touched the ground. I was too frightened to +hang on much longer, yet frightened even more to let loose. Soup was screaming +and so was I. +"Stop," wailed Soup. +From the street, there was one step up to the door of the Baptist Church. The +door was closed.""" + +grade_6 = """"Brothers. What do you expect of me—to stand idly by while you burn my son? +My son has brought death to none of us. The scratches he gave us are not on our +bodies but our pride. Brothers. How if my son is burnt do I go back and face her +who lives with me in my house? How do I look in the eyes of his sisters who +think the rainbow arches over him? Brothers. It is easier for me to fight you all +than go back and say that Cuyloga stood by and did nothing while his brothers +in anger put his son to the fire." +With the quickness of Long Tail, the panther, he took his knife and cut the boy's +thongs. Then he stood there waiting for the attack, but none came. The warriors +were too astonished. They watched, sullen and yet fascinated by the drama. This +was the great Cuyloga at his bravest that they looked upon, and none knew what +he would do next""" + +grade_8 = """All day Buck brooded by the pool or roamed restlessly about the camp. Death, +as a cessation of movement, as a passing out and away from the lives of the +living, he knew, and he knew John Thornton was dead. It left a great void in him +somewhat akin to hunger, but a void which ached and ached, and which food +could not fill. At times when he paused to contemplate the carcasses of the +Yeehats, he forgot the pain of it; and at such times he was aware of a great pride +in himself—a pride greater than any he had yet experienced. He had killed man, +the noblest game of all, and he had killed in the face of the law of club and fang. +He sniffed the bodies curiously. They had died so easily. I was harder to kill a +husky dog than them. They were no match at all, were it not for their arrows and +spears and clubs. Thenceforward he would be unafraid of them except when +they bore in their hands their arrows, spears, and clubs.""" + +grade_10 = """Looking upward, I surveyed the ceiling of my prison. It was some thirty or forty +feet overhead, and constructed much as the side walls. In one of its panels a very +singular figure riveted my whole attention. It was the painted figure of Time as +he is commonly represented, save that, in lieu of a scythe, he held what, at a +casual glance, I supposed to be the pictured image of a huge pendulum, such as +we see on antique clocks. There was something, however, in the appearance of +this machine which caused me to regard it more attentively. While I gazed +directly upward at it (for its position was immediately over my own) I fancied +that I saw it in motion. In an instant afterward the fancy was confirmed. Its +sweep was brief, and of course slow. I watched it for some minutes somewhat in +fear, but more in wonder. Wearied at length with observing its dull movement, I +turned my eyes upon the other objects in the cell. +A slight noise attracted my notice, and, looking to the floor, I saw several +enormous rats traversing it. They had issued from the wall which lay just within +view to my right.""" + +grade_12 = """For the rest he lived solitary, but not misanthropic, with his books and his +collection, classing and arranging specimens, corresponding with entomologists +in Europe, writing up a descriptive catalogue of his treasures. Such was the +history of the man whom I had come to consult upon Jim's case without any +definite hope. Simply to hear what he would have to say would have been a +relief. I was very anxious, but I respected the intense, almost passionate, +absorption with which he looked at a butterfly, as though on the bronze sheen of +these frail wings, in the white tracings, in the gorgeous markings, he could see +other things, an image of something as perishable and defying destruction as +these delicate and lifeless tissues displaying a splendour unmarked by death. +"Marvellous!" he repeated, looking up at me. "Look! The beauty—but that is +nothing—look at the accuracy, the harmony. And so fragile! And so strong! And +so exact! This is Nature—the balance of colossal forces. Every star is so—and +every blade of grass stands so—the mighty Kosmos in perfect equilibrium +produces—this. This wonder; this masterpiece of Nature—the great artist.\"""" + +grade_14 = """It would have been in consonance with the spirit of Captain Vere should +he on this occasion have concealed nothing from the condemned one; +should he indeed have frankly disclosed to him the part he himself had +played in bringing about the decision, at the same time revealing his +actuated motives. On Billy's side it is not improbable that such a +confession would have been received in much the same spirit that +prompted it. Not without a sort of joy indeed he might have appreciated +the brave opinion of him implied in his captain making such a confidant +of him. Nor as to the sentence itself could he have been insensible that it +was imparted to him as to one not afraid to die. Even more may have +Graded Text Samples +Impact Information. Page 9 +been. Captain Vere in the end may have developed the passion +sometimes latent under an exterior stoical or indifferent. He was old +enough to have been Billy's father. The austere devotee of military duty, +letting himself melt back into what remains primeval in our formalised +humanity, may in the end have caught Billy to his heart, even as +Abraham may have caught young Isaac on the brink of resolutely +offering him up in obedience to the exacting behest. """ diff --git a/textdescriptives/tests/test_dependency_distance.py b/textdescriptives/tests/test_dependency_distance.py new file mode 100644 index 00000000..308037af --- /dev/null +++ b/textdescriptives/tests/test_dependency_distance.py @@ -0,0 +1,103 @@ +import spacy +import pytest +from textdescriptives.components import DependencyDistance +from .books import * +import numpy as np +import ftfy + + +@pytest.fixture(scope="function") +def nlp(): + nlp = spacy.load("en_core_web_sm") + nlp.add_pipe("dependency_distance") + return nlp + + +def test_dependency_distance_integration(nlp): + assert "dependency_distance" == nlp.pipe_names[-1] + + +def test_dependency_distance(nlp): + doc = nlp("This is a short and simple sentence") + assert doc._.dependency_distance + assert doc[0:3]._.dependency_distance + assert doc[0]._.dependency_distance + + +@pytest.mark.parametrize("text", ["", "#"]) +def test_dependency_distance_edge(text, nlp): + doc = nlp(text) + for v in doc._.dependency_distance.values(): + v is np.nan + + +@pytest.mark.parametrize( + "text,expected", + [ + (oliver_twist, 3.19), + (secret_garden, 2.32), + (flatland, 3.42), + ], +) +def test_mean_dep_distance(text, expected, nlp): + text = ftfy.fix_text(text) + text = " ".join(text.split()) + doc = nlp(text) + assert ( + pytest.approx(expected, rel=1e-2) + == doc._.dependency_distance["dependency_distance_mean"] + ) + + +@pytest.mark.parametrize( + "text,expected", + [ + (oliver_twist, 1.24), + (secret_garden, 0.85), + (flatland, 1.11), + ], +) +def test_std_dep_distance(text, expected, nlp): + text = ftfy.fix_text(text) + text = " ".join(text.split()) + doc = nlp(text) + assert ( + pytest.approx(expected, rel=1e-2) + == doc._.dependency_distance["dependency_distance_std"] + ) + + +@pytest.mark.parametrize( + "text,expected", + [ + (oliver_twist, 0.41), + (secret_garden, 0.42), + (flatland, 0.45), + ], +) +def test_mean_adj_dep(text, expected, nlp): + text = ftfy.fix_text(text) + text = " ".join(text.split()) + doc = nlp(text) + assert ( + pytest.approx(expected, rel=1e-2) + == doc._.dependency_distance["prop_adjacent_dependency_relation_mean"] + ) + + +@pytest.mark.parametrize( + "text,expected", + [ + (oliver_twist, 0.068), + (secret_garden, 0.098), + (flatland, 0.056), + ], +) +def test_std_adj_dep(text, expected, nlp): + text = ftfy.fix_text(text) + text = " ".join(text.split()) + doc = nlp(text) + assert ( + pytest.approx(expected, rel=1e-2) + == doc._.dependency_distance["prop_adjacent_dependency_relation_std"] + ) diff --git a/textdescriptives/tests/test_descriptive_stats.py b/textdescriptives/tests/test_descriptive_stats.py new file mode 100644 index 00000000..d77698aa --- /dev/null +++ b/textdescriptives/tests/test_descriptive_stats.py @@ -0,0 +1,84 @@ +from spacy.lang.en import English +import pytest +from textdescriptives.components import DescriptiveStatistics + +@pytest.fixture(scope="function") +def nlp(): + nlp = English() + nlp.add_pipe("sentencizer") + nlp.add_pipe("descriptive_stats") + return nlp + +def test_descriptive_stats_integration(nlp): + assert "descriptive_stats" == nlp.pipe_names[-1] + + +def test_descriptive_stats(nlp): + doc = nlp("This is a short and simple sentence") + assert doc._.token_length + assert doc._.sentence_length + assert doc._.syllables + assert doc._.counts + assert doc[0:3]._.token_length + assert doc[0:3]._.counts + + +def test_token_length(nlp): + doc = nlp("Gift cats your prey") + assert doc._.token_length["token_length_mean"] == 4.0 + assert doc._.token_length["token_length_median"] == 4.0 + assert doc._.token_length["token_length_std"] == 0.0 + assert doc[0:2]._.token_length["token_length_mean"] == 4.0 + assert doc[0:2]._.token_length["token_length_median"] == 4.0 + assert doc[0:2]._.token_length["token_length_std"] == 0.0 + + +def test_sentence_length(nlp): + doc = nlp( + "Here is the first sentence. It was pretty short. Let's make another one that's slightly longer and more complex." + ) + assert ( + pytest.approx(6.33, rel=1e-2) == doc._.sentence_length["sentence_length_mean"] + ) + assert ( + pytest.approx(5.0, rel=1e-3) == doc._.sentence_length["sentence_length_median"] + ) + assert pytest.approx(2.62, rel=1e-2) == doc._.sentence_length["sentence_length_std"] + + +def test_syllables_simple(nlp): + doc = nlp("These words are easy") + assert doc._.syllables["syllables_per_token_mean"] == 1.0 + assert doc._.syllables["syllables_per_token_median"] == 1.0 + assert doc._.syllables["syllables_per_token_std"] == 0.0 + + +def test_syllables_complex(nlp): + doc = nlp("This sentence has complicated words in it") + assert pytest.approx(1.43, rel=1e-2) == doc._.syllables["syllables_per_token_mean"] + assert pytest.approx(1.0, rel=1e-3) == doc._.syllables["syllables_per_token_median"] + assert pytest.approx(0.73, rel=1e-2) == doc._.syllables["syllables_per_token_std"] + + +def test_counts(nlp): + doc = nlp( + "Here is the first sentence. It was pretty short. Let's make another one that's slightly longer and more complex." + ) + assert doc._.counts["n_tokens"] == 19 + assert doc._.counts["n_unique_tokens"] == 19 + assert doc._.counts["proportion_unique_tokens"] == 1.0 + assert doc._.counts["n_characters"] == 94 + assert doc._.counts["n_sentences"] == 3 + assert doc[0:6]._.counts["n_tokens"] == 5 + assert doc[0:6]._.counts["n_unique_tokens"] == 5 + assert doc[0:6]._.counts["proportion_unique_tokens"] == 1.0 + assert doc[0:6]._.counts["n_characters"] == 23 + + +@pytest.mark.parametrize("text", ["", "#"]) +def test_descriptive_edge(text, nlp): + doc = nlp(text) + assert doc._.token_length + assert doc._.sentence_length + assert doc._.syllables + assert doc._.counts diff --git a/textdescriptives/tests/test_extractor.py b/textdescriptives/tests/test_extractor.py new file mode 100644 index 00000000..9e304df7 --- /dev/null +++ b/textdescriptives/tests/test_extractor.py @@ -0,0 +1,34 @@ +import textdescriptives as td +import spacy +import pytest +import ftfy + + +@pytest.fixture(scope="function") +def nlp(): + nlp = spacy.load("en_core_web_sm") + nlp.add_pipe("textdescriptives") + return nlp + + +def test_extract_df_single_doc(nlp): + doc = nlp("This is just a cute little text. Actually, it's two sentences.") + td.extract_df(doc) + for metric in ["descriptive_stats", "readability", "dependency_distance"]: + td.extract_df(doc, metrics=metric) + + +def test_extract_df_pipe(nlp): + text = [ + "I wonder how well the function works on multiple documents", + "Very exciting to see, don't you think?", + ] + docs = nlp.pipe(text) + td.extract_df(docs) + +def test_extract_df_subsetters(nlp): + doc = nlp("This is just a cute little text. Actually, it's two sentences.") + df = td.extract_df(doc, include_text=False) + df[td.readability_cols] + df[td.dependency_cols] + df[td.descriptive_stats_cols] diff --git a/textdescriptives/tests/test_load_components.py b/textdescriptives/tests/test_load_components.py new file mode 100644 index 00000000..31c68187 --- /dev/null +++ b/textdescriptives/tests/test_load_components.py @@ -0,0 +1,26 @@ +from spacy.lang.en import English +import pytest +from textdescriptives import TextDescriptives + + +@pytest.fixture(scope="function") +def nlp(): + nlp = English() + nlp.add_pipe("textdescriptives") + return nlp + + +def test_integration(nlp): + assert nlp.pipe_names[-1] == "textdescriptives" + for component in [ + "descriptive_stats", + "readability", + "dependency_distance", + "textdescriptives", + ]: + assert component in nlp.pipe_names + + +def test_simple(nlp): + doc = nlp("This is a short and simple text") + assert doc diff --git a/textdescriptives/tests/test_readability.py b/textdescriptives/tests/test_readability.py new file mode 100644 index 00000000..638108b3 --- /dev/null +++ b/textdescriptives/tests/test_readability.py @@ -0,0 +1,163 @@ +import pytest + +from textdescriptives.components import Readability +from .books import * + +from spacy.lang.en import English + +import numpy as np +import ftfy + + +@pytest.fixture(scope="function") +def nlp(): + nlp = English() + nlp.add_pipe("readability") + return nlp + + +def test_readability_integration(nlp): + assert "readability" == nlp.pipe_names[-1] + + +def test_readability(nlp): + doc = nlp("This is a short and simple sentence") + assert doc._.readability + + +@pytest.mark.parametrize("text", ["", "#"]) +def test_readability_edge(text, nlp): + doc = nlp(text) + for v in doc._.readability.values(): + v is np.nan + + +@pytest.mark.parametrize( + "text,expected", + [ + (oliver_twist, 72.80), + (secret_garden, 92.73), + (flatland, 68.18), + (grade_1, 116.08), + (grade_2, 98.18), + (grade_3, 101.51), + (grade_4, 102.08), + (grade_6, 97.96), + (grade_8, 91.68), + (grade_10, 76.67), + (grade_12, 73.72), + (grade_14, 65.25), + ], +) +def test_flesch_reading_ease(text, expected, nlp): + text = ftfy.fix_text(text) + text = " ".join(text.split()) + doc = nlp(text) + assert pytest.approx(expected, rel=1e-2) == doc._.readability["flesch_reading_ease"] + +@pytest.mark.parametrize( + "text,expected", + [ + (oliver_twist, 9.48), + (secret_garden, 4.04), + (flatland, 12.05), + (grade_1, -1.65), + (grade_2, 1.85), + (grade_3, 1.65), + (grade_4, 1.20), + (grade_6, 2.63), + (grade_8, 4.99), + (grade_10, 6.75), + (grade_12, 6.42), + (grade_14, 10.04), + ], +) +def test_flesch_kincaid_grade(text, expected, nlp): + text = ftfy.fix_text(text) + text = " ".join(text.split()) + doc = nlp(text) + assert pytest.approx(expected, rel=1e-2) == doc._.readability["flesch_kincaid_grade"] + +@pytest.mark.parametrize( + "text,expected", + [ + (oliver_twist, 9.47), + (secret_garden, 6.63), + (flatland, 9.91), + ], +) +def test_smog(text, expected, nlp): + text = ftfy.fix_text(text) + text = " ".join(text.split()) + doc = nlp(text) + assert pytest.approx(expected, rel=1e-2) == doc._.readability["smog"] + +@pytest.mark.parametrize( + "text,expected", + [ + (oliver_twist, 12.19), + (secret_garden, 6.98), + (flatland, 15.05), + ], +) +def test_gunning_fog(text, expected, nlp): + text = ftfy.fix_text(text) + text = " ".join(text.split()) + doc = nlp(text) + assert pytest.approx(expected, rel=1e-2) == doc._.readability["gunning_fog"] + +@pytest.mark.parametrize( + "text,expected", + [ + (oliver_twist, 11.99), + (secret_garden, 5.40), + (flatland, 14.98), + ], +) +def test_automated_readability_index(text, expected, nlp): + text = ftfy.fix_text(text) + text = " ".join(text.split()) + doc = nlp(text) + assert pytest.approx(expected, rel=1e-2) == doc._.readability["automated_readability_index"] + +@pytest.mark.parametrize( + "text,expected", + [ + (oliver_twist, 8.75), + (secret_garden, 6.39), + (flatland, 7.91), + ], +) +def test_coleman_liau_index(text, expected, nlp): + text = ftfy.fix_text(text) + text = " ".join(text.split()) + doc = nlp(text) + assert pytest.approx(expected, rel=1e-2) == doc._.readability["coleman_liau_index"] + +@pytest.mark.parametrize( + "text,expected", + [ + (oliver_twist, 42.70), + (secret_garden, 27.07), + (flatland, 49.87), + ], +) +def test_lix(text, expected, nlp): + text = ftfy.fix_text(text) + text = " ".join(text.split()) + doc = nlp(text) + assert pytest.approx(expected, rel=1e-2) == doc._.readability["lix"] + +@pytest.mark.parametrize( + "text,expected", + [ + (oliver_twist, 4.37), + (secret_garden, 1.81), + (flatland, 5.50), + ], +) +def test_rix(text, expected, nlp): + text = ftfy.fix_text(text) + text = " ".join(text.split()) + doc = nlp(text) + assert pytest.approx(expected, rel=1e-2) == doc._.readability["rix"]