Skip to content

Commit

Permalink
remove output (NVIDIA#5689) (NVIDIA#5690)
Browse files Browse the repository at this point in the history
Signed-off-by: ericharper <[email protected]>

Signed-off-by: ericharper <[email protected]>

Signed-off-by: ericharper <[email protected]>
Co-authored-by: Eric Harper <[email protected]>
  • Loading branch information
2 people authored and titu1994 committed Mar 24, 2023
1 parent 14c0b78 commit 97fe562
Showing 1 changed file with 34 additions and 145 deletions.
179 changes: 34 additions & 145 deletions tutorials/nlp/Token_Classification-BioMegatron.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 19,
"execution_count": null,
"id": "b7a434f4",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -34,7 +34,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"id": "challenging-pioneer",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -95,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"id": "federal-beads",
"metadata": {},
"outputs": [],
Expand All @@ -107,22 +107,10 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": null,
"id": "relevant-juvenile",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading NCBI data...\n",
"Archive: DATA_DIR/NCBI_corpus.zip\n",
" inflating: DATA_DIR/NCBI_corpus_development.txt \n",
" inflating: DATA_DIR/NCBI_corpus_testing.txt \n",
" inflating: DATA_DIR/NCBI_corpus_training.txt \n"
]
}
],
"outputs": [],
"source": [
"print('Downloading NCBI data...')\n",
"wget.download('https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/NCBI_corpus.zip', DATA_DIR)\n",
Expand All @@ -131,18 +119,10 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"id": "radical-castle",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9288106\tClustering of missense mutations in the <category=\"Modifier\">ataxia-telangiectasia</category> gene in a <category=\"SpecificDisease\">sporadic T-cell leukaemia</category>.\t<category=\"SpecificDisease\">Ataxia-telangiectasia</category> ( <category=\"SpecificDisease\">A-T</category> ) is a <category=\"DiseaseClass\">recessive multi-system disorder</category> caused by mutations in the ATM gene at 11q22-q23 ( ref . 3 ) . The risk of <category=\"DiseaseClass\">cancer</category> , especially <category=\"DiseaseClass\">lymphoid neoplasias</category> , is substantially elevated in <category=\"Modifier\">A-T</category> patients and has long been associated with chromosomal instability . By analysing <category=\"Modifier\">tumour</category> DNA from patients with <category=\"SpecificDisease\">sporadic T-cell prolymphocytic leukaemia</category> ( <category=\"SpecificDisease\">T-PLL</category> ) , a rare <category=\"DiseaseClass\">clonal malignancy</category> with similarities to a <category=\"SpecificDisease\">mature T-cell leukaemia</category> seen in <category=\"SpecificDisease\">A-T</category> , we demonstrate a high frequency of ATM mutations in <category=\"SpecificDisease\">T-PLL</category> . In marked contrast to the ATM mutation pattern in <category=\"SpecificDisease\">A-T</category> , the most frequent nucleotide changes in this <category=\"DiseaseClass\">leukaemia</category> were missense mutations . These clustered in the region corresponding to the kinase domain , which is highly conserved in ATM-related proteins in mouse , yeast and Drosophila . The resulting amino-acid substitutions are predicted to interfere with ATP binding or substrate recognition . Two of seventeen mutated <category=\"SpecificDisease\">T-PLL</category> samples had a previously reported <category=\"Modifier\">A-T</category> allele . In contrast , no mutations were detected in the p53 gene , suggesting that this <category=\"Modifier\">tumour</category> suppressor is not frequently altered in this <category=\"DiseaseClass\">leukaemia</category> . Occasional missense mutations in ATM were also found in <category=\"Modifier\">tumour</category> DNA from patients with <category=\"SpecificDisease\">B-cell non-Hodgkins lymphomas</category> ( <category=\"SpecificDisease\">B-NHL</category> ) and a <category=\"Modifier\">B-NHL</category> cell line . The evidence of a significant proportion of loss-of-function mutations and a complete absence of the normal copy of ATM in the majority of mutated <category=\"DiseaseClass\">tumours</category> establishes somatic inactivation of this gene in the pathogenesis of <category=\"SpecificDisease\">sporadic T-PLL</category> and suggests that ATM acts as a <category=\"Modifier\">tumour</category> suppressor . As constitutional DNA was not available , a putative hereditary predisposition to <category=\"SpecificDisease\">T-PLL</category> will require further investigation . . \n"
]
}
],
"outputs": [],
"source": [
"# If you want to see more examples, you can explore the text of the corpus using the file browser to the left, or open files directly, for example typing a command like the following in a code-cell:\n",
"\n",
Expand All @@ -169,21 +149,10 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": null,
"id": "present-interference",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'DATA_DIR/NER/test.tsv'"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"NER_DATA_DIR = f'{DATA_DIR}/NER'\n",
"wget.download('https://raw.githubusercontent.com/spyysalo/ncbi-disease/master/conll/train.tsv', NER_DATA_DIR)\n",
Expand All @@ -193,21 +162,10 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"id": "identical-figure",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 1.5M\n",
"-rw-r--r-- 1 root root 196K Apr 8 00:56 devel.tsv\n",
"-rw-r--r-- 1 root root 201K Apr 8 00:56 test.tsv\n",
"-rw-r--r-- 1 root root 1.1M Apr 8 00:56 train.tsv\n"
]
}
],
"outputs": [],
"source": [
"!ls -lh $NER_DATA_DIR"
]
Expand All @@ -222,7 +180,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": null,
"id": "utility-wesley",
"metadata": {},
"outputs": [],
Expand All @@ -232,44 +190,20 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": null,
"id": "suited-jenny",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'import_from_iob_format (2).py'"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/token_classification/data/import_from_iob_format.py')"
]
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": null,
"id": "sensitive-victoria",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NeMo I 2022-04-08 00:57:03 import_from_iob_format:119] Processing DATA_DIR/NER/train.tsv\n",
"[NeMo I 2022-04-08 00:57:03 import_from_iob_format:124] Processing of the DATA_DIR/NER/train.tsv is complete\n",
"[NeMo I 2022-04-08 00:57:06 import_from_iob_format:119] Processing DATA_DIR/NER/dev.tsv\n",
"[NeMo I 2022-04-08 00:57:06 import_from_iob_format:124] Processing of the DATA_DIR/NER/dev.tsv is complete\n",
"[NeMo I 2022-04-08 00:57:08 import_from_iob_format:119] Processing DATA_DIR/NER/test.tsv\n",
"[NeMo I 2022-04-08 00:57:08 import_from_iob_format:124] Processing of the DATA_DIR/NER/test.tsv is complete\n"
]
}
],
"outputs": [],
"source": [
"! python import_from_iob_format.py --data_file=$NER_DATA_DIR/train.tsv\n",
"! python import_from_iob_format.py --data_file=$NER_DATA_DIR/dev.tsv\n",
Expand All @@ -286,54 +220,20 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": null,
"id": "sound-surgeon",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Identification of APC2 , a homologue of the adenomatous polyposis coli tumour suppressor . \n",
"The adenomatous polyposis coli ( APC ) tumour - suppressor protein controls the Wnt signalling pathway by forming a complex with glycogen synthase kinase 3beta ( GSK - 3beta ) , axin / conductin and betacatenin . \n",
"Complex formation induces the rapid degradation of betacatenin . \n",
"In colon carcinoma cells , loss of APC leads to the accumulation of betacatenin in the nucleus , where it binds to and activates the Tcf - 4 transcription factor ( reviewed in [ 1 ] [ 2 ] ) . \n",
"Here , we report the identification and genomic structure of APC homologues . \n",
"Mammalian APC2 , which closely resembles APC in overall domain structure , was functionally analyzed and shown to contain two SAMP domains , both of which are required for binding to conductin . \n",
"Like APC , APC2 regulates the formation of active betacatenin - Tcf complexes , as demonstrated using transient transcriptional activation assays in APC - / - colon carcinoma cells . \n",
"Human APC2 maps to chromosome 19p13 . \n",
"3 . \n",
"APC and APC2 may therefore have comparable functions in development and cancer . \n"
]
}
],
"outputs": [],
"source": [
"!head $NER_DATA_DIR/text_train.txt"
]
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": null,
"id": "spectacular-strain",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"O O O O O O O O B-Disease I-Disease I-Disease I-Disease O O \n",
"O B-Disease I-Disease I-Disease I-Disease I-Disease I-Disease I-Disease O O O O O O O O O O O O O O O O O O O O O O O O O O O O O \n",
"O O O O O O O O O \n",
"O B-Disease I-Disease O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O \n",
"O O O O O O O O O O O O O \n",
"O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O \n",
"O O O O O O O O O O O O O O O O O O O O O O O O O O B-Disease I-Disease O O \n",
"O O O O O O O \n",
"O O \n",
"O O O O O O O O O O O B-Disease O \n"
]
}
],
"outputs": [],
"source": [
"!head $NER_DATA_DIR/labels_train.txt"
]
Expand Down Expand Up @@ -377,7 +277,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": null,
"id": "speaking-grant",
"metadata": {},
"outputs": [],
Expand All @@ -389,18 +289,10 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": null,
"id": "demanding-ballet",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"config file is already exists\n"
]
}
],
"outputs": [],
"source": [
"# download the model's configuration file \n",
"config_dir = WORK_DIR + '/configs/'\n",
Expand All @@ -414,18 +306,10 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": null,
"id": "criminal-outdoors",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WORK_DIR/configs/token_classification_config.yaml\n"
]
}
],
"outputs": [],
"source": [
"# this line will print the entire config of the model\n",
"config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n",
Expand All @@ -438,7 +322,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": null,
"id": "informed-purse",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -784,7 +668,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3.8.0 ('test_r1.10.0_pip')",
"language": "python",
"name": "python3"
},
Expand All @@ -798,9 +682,14 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.8.0"
},
"vscode": {
"interpreter": {
"hash": "30504a7d8129b3c45f1978a1de0804c162ca7894685891a914c7f1dc31e854c4"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}

0 comments on commit 97fe562

Please sign in to comment.