#!/bin/bash
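# Abort on the first failed command or use of an unset variable (optional
# hardening; drop this line if partial downloads should not stop the run).
set -euo pipefail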
# This script is only a wrapper around two adapted tools used to analyse the
# PubMed corpus data. The result file generated by pubmed_trend_analysis can
# then be analysed with create_heatmaps.r and create_wordclouds.py to generate
# the plots.
# Original versions of the tools can be found at:
# https://github.com/lab42open-team/pubmed_trend_analysis
# https://github.com/sahansera/medline-pubmed-extractor
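#
# Both adapted tools are expected under tools/; the MedlineExtractor binary
# must already be built in Release mode (see the path used below).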
# Download and unzip the PubMed data (the 2024 annual baseline,
# files pubmed24n0001.xml.gz through pubmed24n1219.xml.gz)
mkdir -p pubmed/xml/
for i in $(seq -w 0001 1219)
do
    wget https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n${i}.xml.gz -O pubmed/xml/pubmed24n${i}.xml.gz
    gzip -d pubmed/xml/pubmed24n${i}.xml.gz
done
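
# Optional sanity check: `gzip -t` verifies an archive's integrity and catches
# truncated downloads. A minimal sketch that could be added inside the loop
# above, between the wget and the gzip -d lines:
#   gzip -t pubmed/xml/pubmed24n${i}.xml.gz || echo "Corrupt file: pubmed24n${i}.xml.gz" >&2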
echo "Converting xml files to tsv and counting the total number of articles per year"
echo "This may take a few minutes"
# Convert xml files to tsv files using medline-pubmed-extractor
mkdir -p pubmed/tsv/
tools/medline-pubmed-extractor/MedlineExtractor/bin/Release/MedlineExtractor pubmed/xml/ pubmed/tsv/
gzip pubmed/tsv/*
# Count the total number of articles per year
python count_total_publications.py
# Extract the articles containing the keywords using pubmed_trend_analysis
tools/pubmed_trend_analysis/dig_analysis.sh -k omics_keywords.txt -d pubmed/tsv/ -p omics
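
# Downstream plotting (hypothetical invocations: the scripts named in the
# header are assumed to take the pubmed_trend_analysis result file as their
# first argument; check each script's own usage for the exact interface):
# Rscript create_heatmaps.r <result_file>
# python create_wordclouds.py <result_file>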