Merge branch 'master' of github.com:rxn4chemistry/rxnfp

rxn4chemistry · Oct 14, 2020 · 9c8e4a2 · 9c8e4a2
2 parents 73ace71 + 89248ae
commit 9c8e4a2
Show file tree

Hide file tree

Showing 6 changed files with 15,654 additions and 60 deletions.
diff --git a/docs/fine_tune_bert_on_uspto_1k_tpl.html b/docs/fine_tune_bert_on_uspto_1k_tpl.html
@@ -53,27 +53,12 @@
 </div>
 </div>
 
-<div class="output_wrapper">
-<div class="output">
-
-<div class="output_area">
-
-<div class="output_subarea output_stream output_stderr output_text">
-<pre><span class="ansi-blue-intense-fg ansi-bold">wandb</span>: <span class="ansi-yellow-fg">WARNING</span> W&amp;B installed but not logged in.  Run `wandb login` or set the WANDB_API_KEY env variable.
-This extension has only been tested with simpletransformers==0.34.4
-</pre>
-</div>
-</div>
-
-</div>
-</div>
-
 </div>
     {% endraw %}
 
 <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h2 id="Track-the-training">Track the training<a class="anchor-link" href="#Track-the-training"> </a></h2><p>Will be using wandb to keep track of our training. You can use the an account on <a href="https://www.wandb.com">wandb</a> or create an own instance following the instruction in the <a href="https://docs.wandb.com/self-hosted">documentation</a>.</p>
+<h2 id="Track-the-training">Track the training<a class="anchor-link" href="#Track-the-training"> </a></h2><p>We will be using wandb to keep track of our training. You can use an account on <a href="https://www.wandb.com">wandb</a> or create your own instance following the instructions in the <a href="https://docs.wandb.com/self-hosted">documentation</a>.</p>
 <p>If you then create an <code>.env</code> file in the root folder and specify the <code>WANDB_API_KEY=</code> (and the <code>WANDB_BASE_URL=</code>), you can use dotenv to load those environment variables.</p>
 
 </div>
@@ -135,12 +120,77 @@ <h2 id="Loading-the-training-data">Loading the training data<a class="anchor-lin
 </div>
 </div>
 
+<div class="output_wrapper">
+<div class="output">
+
+<div class="output_area">
+
+
+<div class="output_html rendered_html output_subarea output_execute_result">
+<div>
+<style scoped>
+    .dataframe tbody tr th:only-of-type {
+        vertical-align: middle;
+    }
+
+    .dataframe tbody tr th {
+        vertical-align: top;
+    }
+
+    .dataframe thead th {
+        text-align: right;
+    }
+</style>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>canonical_rxn</th>
+      <th>labels</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>61373</th>
+      <td>C1CCOC1.CC(C)(C)OC(=O)NCC(=O)NCC(=O)O.CCCCCCCC...</td>
+      <td>645</td>
+    </tr>
+    <tr>
+      <th>135235</th>
+      <td>CCN(CC)CCOc1ccc(Cn2c3ccc(OC)cc3c3oc4cc(OCc5ccc...</td>
+      <td>23</td>
+    </tr>
+    <tr>
+      <th>434023</th>
+      <td>CCN(C(C)C)C(C)C.CS(=O)(=O)Cl.C[C@@H]1CCC[C@H](...</td>
+      <td>0</td>
+    </tr>
+    <tr>
+      <th>188234</th>
+      <td>CC(C)(C)[Si](C)(C)OCCn1ccc(N)n1.Cc1cccc(C)n1.C...</td>
+      <td>795</td>
+    </tr>
+    <tr>
+      <th>406062</th>
+      <td>CCOC(C)=O.CO.COc1cc(C(=O)O)c(Cl)cc1Br.C[Si](C)...</td>
+      <td>195</td>
+    </tr>
+  </tbody>
+</table>
+</div>
+</div>
+
+</div>
+
+</div>
+</div>
+
 </div>
     {% endraw %}
 
 <div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h2 id="Make-splits">Make splits<a class="anchor-link" href="#Make-splits"> </a></h2><p>The data was already shuffled. We take 10% as test set and a small validation set 604 reactions to keep track of the performance during training.</p>
+<h2 id="Make-splits">Make splits<a class="anchor-link" href="#Make-splits"> </a></h2><p>The data was already shuffled. We take 10% as test set and a small validation set of 604 reactions to keep track of the performance during training.</p>
 
 </div>
 </div>
@@ -207,7 +257,7 @@ <h2 id="Load-model-pretrained-on-a-Masked-Language-Modeling-task-and-train">Load
 
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">model_path</span> <span class="o">=</span>  <span class="n">pkg_resources</span><span class="o">.</span><span class="n">resource_filename</span><span class="p">(</span><span class="s2">&quot;rxnfp&quot;</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;models/transformers/bert_mlm_1k_tpl&quot;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">model_path</span> <span class="o">=</span>  <span class="n">pkg_resources</span><span class="o">.</span><span class="n">resource_filename</span><span class="p">(</span><span class="s2">&quot;rxnfp&quot;</span><span class="p">,</span> <span class="s2">&quot;models/transformers/bert_mlm_1k_tpl&quot;</span><span class="p">)</span>
 <span class="n">model</span> <span class="o">=</span> <span class="n">SmilesClassificationModel</span><span class="p">(</span><span class="s2">&quot;bert&quot;</span><span class="p">,</span> <span class="n">model_path</span><span class="p">,</span> <span class="n">num_labels</span><span class="o">=</span><span class="nb">len</span><span class="p">(</span><span class="n">final_train_df</span><span class="o">.</span><span class="n">labels</span><span class="o">.</span><span class="n">unique</span><span class="p">()),</span> <span class="n">args</span><span class="o">=</span><span class="n">model_args</span><span class="p">,</span> <span class="n">use_cuda</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">is_available</span><span class="p">())</span>
 </pre></div>
 
@@ -263,7 +313,7 @@ <h2 id="Load-trained-model-(that-we've-trained-using-this-script)">Load trained
 
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">train_model_path</span> <span class="o">=</span>  <span class="n">pkg_resources</span><span class="o">.</span><span class="n">resource_filename</span><span class="p">(</span><span class="s2">&quot;rxnfp&quot;</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;models/transformers/bert_class_1k_tpl&quot;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">train_model_path</span> <span class="o">=</span>  <span class="n">pkg_resources</span><span class="o">.</span><span class="n">resource_filename</span><span class="p">(</span><span class="s2">&quot;rxnfp&quot;</span><span class="p">,</span> <span class="s2">&quot;models/transformers/bert_class_1k_tpl&quot;</span><span class="p">)</span>
 
 <span class="n">model</span> <span class="o">=</span> <span class="n">SmilesClassificationModel</span><span class="p">(</span><span class="s2">&quot;bert&quot;</span><span class="p">,</span> <span class="n">train_model_path</span><span class="p">,</span> <span class="n">use_cuda</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">is_available</span><span class="p">())</span>
 </pre></div>