-
Notifications
You must be signed in to change notification settings - Fork 1
/
predict_utils.R
105 lines (92 loc) · 5.61 KB
/
predict_utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Build the grid of prediction inputs: the given change metrics crossed with
# every distinct repo and every distinct committer team found in the global
# `data`.
#
# Arguments (raw values; each defaults to exp(mean(<log column>)) - 1 taken
# from `data`):
#   added, removed, complexity, duplicates
#
# Returns the result of expand_grid() over
#   - a data frame holding the standardised predictors A, R, C, D next to the
#     raw added / removed / complexity / duplicates values,
#   - the distinct values of data$repo (column `repo`),
#   - the distinct values of data$committerteam (column `team`).
#
# NOTE(review): relies on a global `data` with columns logADD, logDEL,
# logCOMPLEX, logDUP, repo and committerteam existing when this is called.
generate_grid <- function(added = exp(mean(data$logADD)) - 1,
                          removed = exp(mean(data$logDEL)) - 1,
                          complexity = exp(mean(data$logCOMPLEX)) - 1,
                          duplicates = exp(mean(data$logDUP)) - 1) {
  # log(value + 1), centred and scaled by the matching log column of `data`.
  standardise <- function(value, reference) {
    (log(value + 1) - mean(reference)) / sd(reference)
  }

  predictors <- data.frame(
    A = standardise(added, data$logADD),
    R = standardise(removed, data$logDEL),
    C = standardise(complexity, data$logCOMPLEX),
    D = standardise(duplicates, data$logDUP),
    added = added,
    removed = removed,
    complexity = complexity,
    duplicates = duplicates
  )
  repo_levels <- distinct(data.frame(repo = data$repo))
  team_levels <- distinct(data.frame(team = data$committerteam))

  # The data frames are passed unnamed, exactly as before, so expand_grid()
  # treats them the same way it did in the original call.
  expand_grid(predictors, repo_levels, team_levels)
}
# Summarise posterior predictions for every team / repo combination.
#
# Arguments:
#   model       fitted model handed to posterior_predict() (presumably a brms
#               fit -- confirm with callers).
#   added, removed, complexity, duplicates
#               raw change metrics, forwarded to generate_grid(); the defaults
#               are derived from the global `data`.
#   summary     function applied to the draws of each grid row; the default
#               returns the share of draws that are exactly 0.
#   ndraws      number of posterior draws requested from posterior_predict().
#               Defaults to 10000, the value that used to be hard-coded.
#               NOTE(review): brms is expected to reject a value larger than
#               the number of draws stored in the model -- lower it for
#               smaller fits (confirm against the brms documentation).
#
# Returns the grid from generate_grid() with one extra column, `summary`,
# holding the value of `summary` for each row.
posterior_predict_by_team_and_repo <- function(model,
                                               added = exp(mean(data$logADD)) - 1,
                                               removed = exp(mean(data$logDEL)) - 1,
                                               complexity = exp(mean(data$logCOMPLEX)) - 1,
                                               duplicates = exp(mean(data$logDUP)) - 1,
                                               summary = function(x) { length(which(x == 0)) / length(x) },
                                               ndraws = 10000) {
  grid <- generate_grid(added, removed, complexity, duplicates)
  draws <- posterior_predict(
    model,
    newdata = grid,
    ndraws = ndraws,
    allow_new_levels = TRUE
  )
  # After data.frame() each column is summarised on its own; the results are
  # stored against the grid rows in order. Kept in a separate variable so the
  # `summary` function argument is not overwritten by its own result.
  per_row <- sapply(data.frame(draws), summary)
  grid$summary <- per_row
  grid
}
# Draw a team-by-repo heatmap of a per-cell summary value.
#
# Arguments:
#   postpredict  data frame with the columns team, repo, summary, added,
#                removed, complexity and duplicates (presumably the output of
#                posterior_predict_by_team_and_repo -- confirm with callers).
#   summation    text placed in front of "by team and repo" in the title.
#   decimals     number of decimals used when rounding the tile labels.
#
# Returns the ggplot object.
heatmap_by_team_and_repo <- function(postpredict, summation, decimals = 2) {
  # Distinct raw metric values present in the input; they go in the subtitle.
  n_added <- distinct(select(postpredict, added))$added
  n_removed <- distinct(select(postpredict, removed))$removed
  n_complexity <- distinct(select(postpredict, complexity))$complexity
  n_duplicates <- distinct(select(postpredict, duplicates))$duplicates

  heading <- paste(summation, "by team and repo")
  subheading <- paste0(
    "added: ", round(n_added, 0),
    " removed: ", round(n_removed, 0),
    " complexity: ", round(n_complexity, 0),
    " duplicates: ", round(n_duplicates, 0)
  )

  tiles <- ggplot(postpredict, aes(x = team, y = repo)) +
    geom_tile(aes(fill = summary)) +
    geom_text(aes(label = round(summary, decimals)), color = "white") +
    xlab("team") +
    ylab("repo") +
    ggtitle(heading, subheading)
  tiles
}
# Draw posterior predictions for the given team(s) and repo(s).
#
# Arguments:
#   model       fitted model handed to predicted_draws() (presumably a brms
#               fit used through tidybayes -- confirm with callers).
#   team, repo  team and repo values to predict for; they are crossed with the
#               metrics via expand_grid().
#   added, removed, complexity, duplicates
#               raw change metrics; the defaults are derived from the global
#               `data`. They are standardised as
#               (log(x + 1) - mean(log column)) / sd(log column).
#   ndraws      number of posterior draws requested from predicted_draws().
#               Defaults to 10000, the value that used to be hard-coded.
#               NOTE(review): the underlying model is expected to reject a
#               value larger than its stored number of draws -- lower it for
#               smaller fits (confirm against the brms / tidybayes docs).
#
# Returns the output of predicted_draws() converted with data.frame().
#
# NOTE(review): relies on a global `data` with columns logADD, logDEL,
# logCOMPLEX and logDUP existing when this is called.
predict_for_team <- function(model, team, repo,
                             added = exp(mean(data$logADD)) - 1,
                             removed = exp(mean(data$logDEL)) - 1,
                             complexity = exp(mean(data$logCOMPLEX)) - 1,
                             duplicates = exp(mean(data$logDUP)) - 1,
                             ndraws = 10000) {
  numbers <- data.frame(
    A = (log(added + 1) - mean(data$logADD)) / sd(data$logADD),
    R = (log(removed + 1) - mean(data$logDEL)) / sd(data$logDEL),
    C = (log(complexity + 1) - mean(data$logCOMPLEX)) / sd(data$logCOMPLEX),
    D = (log(duplicates + 1) - mean(data$logDUP)) / sd(data$logDUP),
    added = added,
    removed = removed,
    complexity = complexity,
    duplicates = duplicates
  )
  # `repo` and `team` are passed as bare names on purpose; the downstream
  # plotting helpers read columns called repo and team.
  grid <- expand_grid(numbers, repo, team)

  # The result is deliberately NOT stored in a local called `data`: the
  # default arguments above read the global `data` lazily, and a local of the
  # same name would shadow it for any default not yet evaluated.
  draws <- predicted_draws(
    model,
    newdata = grid,
    ndraws = ndraws,
    allow_new_levels = TRUE
  )
  data.frame(draws)
}
# Plot the empirical cumulative distribution of the predicted number of
# introduced duplicates: one curve per team, one facet per repo.
#
# Arguments:
#   predictions  the output of predict_for_team; the code reads its columns
#                .prediction, team, repo, added, removed, complexity and
#                duplicates.
#
# Team colours are taken from the global COLOR_BY_TEAM (defined elsewhere).
# Returns the ggplot object.
plot_cumulative_prob_of_duplicates <- function(predictions) {
  # Distinct raw metric values present in the input; they go in the subtitle.
  n_added <- distinct(select(predictions, added))$added
  n_removed <- distinct(select(predictions, removed))$removed
  n_complexity <- distinct(select(predictions, complexity))$complexity
  n_duplicates <- distinct(select(predictions, duplicates))$duplicates

  subheading <- paste0(
    "added: ", round(n_added, 0),
    " removed: ", round(n_removed, 0),
    " complexity: ", round(n_complexity, 0),
    " duplicates: ", round(n_duplicates, 0)
  )

  # NOTE(review): the grouping looks like it has no effect on the drawn plot;
  # it is kept so the data attached to the returned plot stays the same.
  plot_data <- group_by(
    mutate(predictions, predict_INTROD = .prediction),
    team, repo, predict_INTROD
  )

  ggplot(plot_data, aes(x = predict_INTROD, color = team)) +
    stat_ecdf() +
    facet_wrap(~ repo) +
    xlab("Maximum number of introduced duplicates") +
    ggtitle("Cumulative probability of introduced duplicates", subheading) +
    scale_color_manual(values = COLOR_BY_TEAM) +
    theme_bw()
}
# Half-eye plot of the predictions, coloured by repo and faceted by team.
#
# Arguments:
#   postpredict  data frame with the columns .prediction, team, repo, added,
#                removed, complexity and duplicates (presumably the output of
#                predict_for_team -- confirm with callers).
#
# Returns the ggplot object.
halfeye_per_team <- function(postpredict) {
  # Distinct raw metric values present in the input; they go in the subtitle.
  n_added <- distinct(select(postpredict, added))$added
  n_removed <- distinct(select(postpredict, removed))$removed
  n_complexity <- distinct(select(postpredict, complexity))$complexity
  n_duplicates <- distinct(select(postpredict, duplicates))$duplicates

  subheading <- paste0(
    "added: ", round(n_added, 0),
    " removed: ", round(n_removed, 0),
    " complexity: ", round(n_complexity, 0),
    " duplicates: ", round(n_duplicates, 0)
  )

  by_team_and_repo <- group_by(postpredict, team, repo)
  eyes <- ggplot(by_team_and_repo, aes(x = .prediction, color = repo)) +
    stat_halfeye(fill = "black") +
    facet_wrap(~ team) +
    ggtitle("Prediction by team", subheading)
  eyes
}
# Histogram (bin width 1) of the predictions, coloured by repo and faceted by
# team.
#
# Arguments:
#   postpredict  data frame with the columns .prediction, team, repo, added,
#                removed, complexity and duplicates (presumably the output of
#                predict_for_team -- confirm with callers).
#
# Returns the ggplot object.
histogram_per_team <- function(postpredict) {
  # Distinct raw metric values present in the input; they go in the subtitle.
  n_added <- distinct(select(postpredict, added))$added
  n_removed <- distinct(select(postpredict, removed))$removed
  n_complexity <- distinct(select(postpredict, complexity))$complexity
  n_duplicates <- distinct(select(postpredict, duplicates))$duplicates

  subheading <- paste0(
    "added: ", round(n_added, 0),
    " removed: ", round(n_removed, 0),
    " complexity: ", round(n_complexity, 0),
    " duplicates: ", round(n_duplicates, 0)
  )

  by_team_and_repo <- group_by(postpredict, team, repo)
  bars <- ggplot(by_team_and_repo, aes(x = .prediction, color = repo)) +
    geom_histogram(binwidth = 1) +
    facet_wrap(~ team) +
    ggtitle("Prediction by team", subheading)
  bars
}