generated from NCBI-Codeathons/codeathon-team-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_amr_presence_absence.R
executable file
·75 lines (50 loc) · 2.14 KB
/
create_amr_presence_absence.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
library(readr)
library(tidyr)
library(dplyr)
# Load the tab-separated table; the first row supplies the column names
DF <- read_tsv(
  "config/amrfinder.tsv",
  col_names = TRUE,
  show_col_types = FALSE
)
# Tidy the column names ----------------------------------------------------
# Swap every space for "_" so columns can be referenced as bare names
col_names <- gsub(" ", "_", names(DF))
# Write the names back with every '#' character stripped out as well
names(DF) <- gsub("#", "", col_names)
# Remove all entries that do not contain "Neisseria gonorrhoeae" (currently disabled)
#DF <- DF %>%
# filter(grepl("Neisseria gonorrhoeae", Scientific_name))
#change entries to "EXCLUDE" for "INTERNAL_STOP|PARTIALX|PARTIALP" in the "Method" column
#DF <- DF %>%
# mutate(
# Subtype = if_else(grepl("INTERNAL_STOP|PARTIALX|PARTIALP", Method), "EXCLUDE", Subtype),
# Subclass = if_else(grepl("INTERNAL_STOP|PARTIALX|PARTIALP", Method), "EXCLUDE", Subclass)
# )
# Keep only rows whose Method matches none of the excluded terms
DF <- filter(DF, !grepl("INTERNAL_STOP|PARTIALX|PARTIALP", Method))
# Build the presence/absence table: 1 = present, 0 = absent ----------------
# One row per unique (BioSample, Element_symbol) pair
new_DF <- distinct(select(DF, BioSample, Element_symbol))
# Every pair that occurs in the data is marked as present
new_DF <- mutate(new_DF, present = 1)
# Spread the element symbols into columns; combinations that never occurred
# are filled with 0
new_DF <- pivot_wider(
  new_DF,
  names_from = Element_symbol,
  values_from = present,
  values_fill = list(present = 0)
)
# Save the wide table without a row-name column
write.csv(new_DF, file = "data/biosample_gene.csv", row.names = FALSE)
# Code to check that this is working as expected ---------------------------
# Count the 1s and 0s recorded for one BioSample in a presence/absence table.
#
# Arguments:
#   df         data frame with a "BioSample" column; every other column is
#              compared against 1 and 0
#   sample_id  BioSample identifier whose row(s) should be tallied
#
# Returns list(ones = <count>, zeros = <count>). NA cells are ignored, and
# both counts are 0 when sample_id matches no row.
# Stops with an error when df has no "BioSample" column.
count_ones_zeros <- function(df, sample_id) {
  # Fail loudly rather than silently reporting 0/0 when the id column is absent
  if (!"BioSample" %in% names(df)) {
    stop("`df` must contain a 'BioSample' column", call. = FALSE)
  }
  # which() discards NA comparisons, so rows with a missing BioSample are
  # never picked up as all-NA rows
  sample_rows <- which(df$BioSample == sample_id)
  # Select every column except BioSample with a logical mask
  value_cols <- names(df) != "BioSample"
  # drop = FALSE keeps a data frame even when only one value column remains
  sample_values <- df[sample_rows, value_cols, drop = FALSE]
  list(
    ones = sum(sample_values == 1, na.rm = TRUE),
    zeros = sum(sample_values == 0, na.rm = TRUE)
  )
}
# Apply the function to a specific BioSample to check for accuracy
result <- count_ones_zeros(new_DF, "SAMD00099414")
print(result)
# Check the number of unique BioSamples; new_DF has one row per BioSample,
# so the two numbers reported here should agree
unique_biosamples <- DF %>%
  distinct(BioSample)
message(
  "Unique BioSamples: ", nrow(unique_biosamples),
  " (rows in new_DF: ", nrow(new_DF), ")"
)
# Check the number of unique element symbols; new_DF has one column per
# symbol plus the BioSample column, so these two numbers should agree as well
unique_terms <- DF %>%
  distinct(Element_symbol)
message(
  "Unique element symbols: ", nrow(unique_terms),
  " (value columns in new_DF: ", ncol(new_DF) - 1L, ")"
)