
Commit

Merge pull request #193 from NLPatVCU/hotfix
Hotfix
swfarnsworth authored May 25, 2020
2 parents d3f5591 + 201a960 commit 5e7db40
Showing 7 changed files with 9 additions and 155 deletions.
28 changes: 0 additions & 28 deletions guide/models/clinical_notes_model.md

This file was deleted.

1 change: 0 additions & 1 deletion guide/models/epa_systematic_review_model.md

This file was deleted.

1 change: 0 additions & 1 deletion guide/models/nanomedicine_drug_labels.md

This file was deleted.

52 changes: 4 additions & 48 deletions guide/walkthrough/data_management.md
@@ -46,15 +46,13 @@ MedaCy **does not** alter the data you load in any way - it only reads from it.

A common data workflow might look like this.

running:

```pythonstub
>>> from medacy.data.dataset import Dataset
>>> from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap
>>> dataset = Dataset('/home/medacy/data')
>>> for data_file in dataset:
...     data_file.file_name
'file_one'
'file_two'
>>> data
@@ -63,7 +61,7 @@ running:
False
>>> metamap = MetaMap('/home/path/to/metamap/binary')
>>> with metamap:
...     dataset.metamap(metamap)
>>> dataset.is_metamapped()
True
```
@@ -81,47 +79,6 @@ home/medacy/data
└── file_two.metamapped
```



## Loading a medaCy compatible dataset
Using a *medaCy compatible dataset* package to manage your training data ensures that data is easy and efficient to access, versioned for replicability, and distributable (selectively!).

A *medaCy compatible dataset* is a Python package wrapping data that can be hooked into medaCy. We can install a *medaCy compatible dataset* just like any other Python package. For instance,


`pip install https://github.com/NanoNLP/medaCy_dataset_end/archive/v1.0.3.tar.gz#egg=medacy_dataset_end-1.0.3`

will install `v1.0.3` of the [END](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5644562/) dataset. Alternatively,

`pip install git+https://github.com/NanoNLP/medaCy_dataset_end.git`

will install the latest version of the [END](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5644562/) dataset.

After you have installed a *medaCy compatible dataset*, loading it returns the configured training and evaluation `Dataset` objects alongside meta-data in a `tuple`, as follows:

```python
from medacy.data import Dataset

training_dataset, evaluation_dataset, meta_data = Dataset.load_external('medacy_dataset_end')

```

Alternatively, import the dataset package and call its load method directly:

```python
import medacy_dataset_end

training_dataset, evaluation_dataset, meta_data = medacy_dataset_end.load()

print(meta_data['entities']) #entities this dataset annotates
print(meta_data['relations']) #relations this dataset annotates (END has None)

training_dataset = medacy_dataset_end.load_training_dataset() #access just training

evaluation_dataset = medacy_dataset_end.load_evaluation_dataset() #access just evaluation

```

## Using a Dataset
A *Dataset* is utilized for two main tasks:

@@ -139,7 +96,7 @@ from medacy.pipelines import FDANanoDrugLabelPipeline
dataset = Dataset('/home/medacy/data')
entities = ['Nanoparticle', 'Dose']
pipeline = FDANanoDrugLabelPipeline(entities=entities)
model = Model(pipeline, n_jobs=1)
model = Model(pipeline)

model.fit(dataset)
```
@@ -172,4 +129,3 @@ By default, this creates a sub-directory in your prediction dataset named *predictions*
```

where all files under *predictions* are the trained model's predictions over your test data.
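
For reference, a minimal sketch of producing those prediction files, building on the training snippet above. The test-data path is hypothetical and `prediction_directory` is an assumed keyword argument; the exact `Model.predict` signature may differ between medaCy versions.

```python
from medacy.data.dataset import Dataset
from medacy.model.model import Model
from medacy.pipelines import FDANanoDrugLabelPipeline

# Sketch only: train as shown above, then predict over a second Dataset.
# prediction_directory is an assumption; by default predictions land in a
# 'predictions' sub-directory of the dataset being predicted over.
pipeline = FDANanoDrugLabelPipeline(entities=['Nanoparticle', 'Dose'])
model = Model(pipeline)
model.fit(Dataset('/home/medacy/data'))

test_dataset = Dataset('/home/medacy/test_data')
model.predict(test_dataset, prediction_directory='/home/medacy/test_data/predictions')
```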

76 changes: 3 additions & 73 deletions guide/walkthrough/model_training.md
@@ -64,15 +64,15 @@ The previously mentioned components make up a medaCy model. In summary training

```python
import os
from medacy.data import Dataset
from medacy.data.dataset import Dataset
from medacy.pipelines import ClinicalPipeline
from medacy.ner import Model
from medacy.model.model import Model

entities = ['Drug', 'Strength']

training_dataset = Dataset('/home/medacy/clinical_training_data/')
pipeline = ClinicalPipeline(metamap=None, entities=entities)
model = Model(pipeline, n_jobs=30) #distribute documents between 30 processes during training and prediction
model = Model(pipeline)

output_file_path = '/home/medacy/clinical_model.pickle'
# Protect against running fit() without having a valid place to save it
@@ -81,74 +81,4 @@ assert os.path.isfile(output_file_path)
model.fit(training_dataset)

model.dump(output_file_path)


```
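
A short follow-up sketch of reusing the dumped model in a later session. It assumes `Model.load` is the counterpart of `Model.dump` and restores the pickled learner into a model built with the same pipeline; check the method name against your medaCy version.

```python
from medacy.model.model import Model
from medacy.pipelines import ClinicalPipeline

# Sketch: rebuild the same pipeline, then load the weights dumped above.
pipeline = ClinicalPipeline(metamap=None, entities=['Drug', 'Strength'])
model = Model(pipeline)
model.load('/home/medacy/clinical_model.pickle')  # assumed counterpart of dump()

annotations = model.predict("The patient took 5 mg of aspirin.")
```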

The `ClinicalPipeline` source looks like this:

```python
import sklearn_crfsuite
import spacy

from medacy.pipeline_components.feature_extractors.discrete_feature_extractor import FeatureExtractor
from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap
from medacy.pipeline_components.feature_overlayers.metamap.metamap_component import MetaMapOverlayer
from medacy.pipeline_components.tokenizers.clinical_tokenizer import ClinicalTokenizer
from medacy.pipelines.base.base_pipeline import BasePipeline


class ClinicalPipeline(BasePipeline):
"""
A pipeline for clinical named entity recognition. A special tokenizer that breaks down a clinical document
to character level tokens defines this pipeline. It was created for the extraction of ADE related entities
from the 2018 N2C2 Shared Task.
Created by Andriy Mulyar (andriymulyar.com) of NLP@VCU
"""


def __init__(self, entities, metamap=None, **kwargs):
"""
Create a pipeline with the name 'clinical_pipeline' utilizing
by default spaCy's small english model.
:param entities: a list of entities to use in this pipeline.
:param metamap: an instance of MetaMap if metamap should be used, defaults to None.
"""

super().__init__(entities, spacy_pipeline=spacy.load("en_core_web_sm"))

if isinstance(metamap, MetaMap):
self.add_component(MetaMapOverlayer, metamap)

def get_learner(self):
return ("CRF_l2sgd",
sklearn_crfsuite.CRF(
algorithm='l2sgd',
c2=0.1,
max_iterations=100,
all_possible_transitions=True
)
)

def get_tokenizer(self):
return ClinicalTokenizer(self.spacy_pipeline)

def get_feature_extractor(self):
return FeatureExtractor(window_size=3, spacy_features=['pos_', 'shape_', 'prefix_', 'suffix_', 'text'])
```


The `__init__` method defines pipeline meta-data along with initializing the sequence of components the pipeline will use to annotate custom token attributes over the document. Components are imported and initialized as part of the pipeline by calling the `add_component` method. The first parameter is a component and the subsequent parameters are any arguments that are passed to the component on initialization. Token attributes beginning with `feature_` are automatically collected by the `FeatureExtractor` initialized in the `get_feature_extractor` method. Note that the instantiation of the `FeatureExtractor` allows the definition of an array of `spacy_features` to utilize - these can be any attribute of a spaCy [Token](https://spacy.io/api/token#attributes).
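
To make the `feature_` naming convention concrete, here is a small plain-spaCy illustration. This is not medaCy's component API - the attribute name and the overlay function are made up for the example - but it shows the kind of token attribute the `FeatureExtractor` would pick up.

```python
import spacy
from spacy.tokens import Token

nlp = spacy.load("en_core_web_sm")

# Attributes whose names start with feature_ are the ones the FeatureExtractor
# collects; this particular extension name is hypothetical.
Token.set_extension("feature_is_four_digit_number", default=False, force=True)

def overlay_example_feature(doc):
    # Mark four-digit numbers (e.g. years) on every token in the document.
    for token in doc:
        token._.feature_is_four_digit_number = token.like_num and len(token.text) == 4
    return doc

doc = overlay_example_feature(nlp("The patient was admitted in 2018."))
print([(t.text, t._.feature_is_four_digit_number) for t in doc])
```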

The `get_learner` method returns a configured instance of the machine learning algorithm to utilize for training a model. Currently only CRF models wrapped by the package [sklearn-crfsuite](https://sklearn-crfsuite.readthedocs.io/en/latest/) are allowed.

The `get_tokenizer` method returns a configured medaCy tokenizer. An interface for building and maintaining a tokenizer is provided and the pattern from `ClinicalTokenizer` can be followed for engineering your own.

The `get_feature_extractor` method returns a configured feature extractor. This defines how and what features from annotated documents are collected to be fed into the model during training or prediction. The example configuration means that all medaCy-annotated features and the specified `spacy_features` are collected in a range of three tokens to the left and three tokens to the right of every token (i.e. the `window_size`).
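
A tiny plain-Python illustration of what a window of three means for a single token; the token list and helper are hypothetical, not part of medaCy.

```python
def context_window(tokens, index, window_size=3):
    # Tokens whose features are pooled for tokens[index], mirroring window_size=3.
    left = max(0, index - window_size)
    right = min(len(tokens), index + window_size + 1)
    return tokens[left:right]

tokens = ['The', 'patient', 'took', '5', 'mg', 'of', 'aspirin', 'daily', '.']
print(context_window(tokens, 4))  # ['patient', 'took', '5', 'mg', 'of', 'aspirin', 'daily']
```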





2 changes: 0 additions & 2 deletions guide/walkthrough/model_utilization.md
@@ -31,11 +31,9 @@ One of medaCy's most powerful features is the ability to maintain, version and d
Once a model has been [packaged](packaging_a_medacy_model.md) and installed, it can be used as follows:

```python
import medacy_model_clinical_notes #import the python package wrapping the model
from medacy.model.model import Model

model = Model.load_external('medacy_model_clinical_notes')

annotations = model.predict("The patient took 5 mg of aspirin.")
```

4 changes: 2 additions & 2 deletions medacy/__main__.py
@@ -8,7 +8,7 @@
import logging

from medacy.data.dataset import Dataset
from medacy.model.model import Model
from medacy.model.model import Model, DEFAULT_NUM_FOLDS
from medacy.pipelines import bert_pipeline
from medacy.tools.json_to_pipeline import json_to_pipeline

@@ -145,7 +145,7 @@ def main():

# Cross Validation arguments
parser_validate = subparsers.add_parser('validate', help='Cross validate a model on a given dataset.')
parser_validate.add_argument('-k', '--k_folds', default=5, type=int, help='Number of folds to use for cross-validation.')
parser_validate.add_argument('-k', '--k_folds', default=DEFAULT_NUM_FOLDS, type=int, help='Number of folds to use for cross-validation.')
parser_validate.add_argument('-gt', '--groundtruth', type=str, default=None, help='Directory to write groundtruth files.')
parser_validate.add_argument('-pd', '--predictions', type=str, default=None, help='Directory to write prediction files.')
parser_validate.set_defaults(func=cross_validate)
