Skip to content

Latest commit

 

History

History
65 lines (55 loc) · 1.81 KB

doi_extraction.md

File metadata and controls

65 lines (55 loc) · 1.81 KB

DOI extraction and matching

library(tidyverse)
library(stringi)
library(biblids) # https://github.com/subugoe/biblids
# Load the exported citation rows from the local CSV snapshot.
npl_df <- readr::read_csv(file = "data/bq_doi_20210824.csv")

Extract DOIs

# Run the biblids DOI extractor over the `npl_text` column and keep only the
# extraction result plus the `publication_number` it came from.
# NOTE(review): the result is later passed to as.data.frame(), so it is
# presumably a matrix with one column per match -- confirm in biblids docs.
npl_tt <- npl_df %>%
  transmute(
    doi = biblids::str_extract_all_doi(npl_text),
    publication_number
  )

# Reshape the extraction result into one row per (publication_number, DOI).
#
# `npl_tt$doi` is spread into separate columns via as.data.frame(), then
# pivot_longer() stacks every column except `publication_number` into a single
# `value` column; NA entries (presumably rows with fewer matches than the
# widest row) are dropped during the pivot.
#
# Trailing punctuation carried over from the surrounding citation text is
# stripped in a single pass: the character class removes any run of ".", ">",
# "," or ";" at the end of the string. Removing them one at a time in a fixed
# order left mixed tails such as ".," or ".>" only partially cleaned.
npl_tidy <- tibble(
  as.data.frame(npl_tt$doi),
  publication_number = npl_tt$publication_number
) %>%
  pivot_longer(!publication_number, values_drop_na = TRUE) %>%
  mutate(doi_cleaned = str_remove(value, "[.>,;]+$")) %>%
  select(publication_number, doi_cleaned)

# Auto-print the tibble to inspect the first rows and the overall dimensions.
npl_tidy
#> # A tibble: 801,500 × 2
#>    publication_number doi_cleaned                   
#>    <chr>              <chr>                         
#>  1 US-9435915-B1      10.1021/cm062619r             
#>  2 US-9467500-B2      10.1109/CCGRID.2012.143       
#>  3 US-9467500-B2      10.1109/CISIS                 
#>  4 US-9493516-B2      10.1371/journal.pone.0019991  
#>  5 US-9493516-B2      10.1016/j.jalz.2011.03.005    
#>  6 US-9493516-B2      10.1186/alzrt62               
#>  7 US-9497379-B2      10.1016/j.ultramic.2003.11.001
#>  8 US-9560489-B2      10.1109/TMC.2011.216          
#>  9 US-9560489-B2      10.1145/1023783.1023786       
#> 10 US-9622484-B2      10.1371/journal.pone.0116871  
#> # … with 801,490 more rows

# Persist the publication_number / doi_cleaned pairs to a local CSV file.
write_csv(npl_tidy, "data/dois_to_be_checked.csv")

Upload to Google BigQuery

library(bigrquery)

# Reference to the destination table (project / dataset / table).
patent_dois <- bq_table(
  project = "api-project-764811344545",
  dataset = "tmp",
  table = "patent_dois"
)

# Remove a table left over from an earlier run before uploading again.
if (bq_table_exists(patent_dois)) {
  bq_table_delete(patent_dois)
}

# Upload the tidy DOI table to the destination.
bigrquery::bq_table_upload(x = patent_dois, values = npl_tidy)