sedaData.Rmd

---
title: "Binsmooth version 0.3.0 Tests"
author: "David J. Hunter"
date: "January 15, 2022"
output: html_document
---

## Try using the SEDA test data

We use grade 6 math results for Texas from the 2017-2018 school year.

```{r}
library(tidyverse)
# SEDA long-format file, restricted to the 2018 / TX / grade 6 / math rows.
# cs_mn_all is the y*-scale mean, cs_mnse_all is the SE.
sedaMth6 <- filter(
  read_csv("seda_geodist_long_cs_4.1.csv", guess_max = 10000),
  year == 2018,
  stateabb == "TX",
  grade == 6,
  subject == "mth"
)

# Raw grade 6 file: keep the identifying columns, the four bin counts,
# and the reported average scale score.
rawGrade6 <- select(
  read_csv("grade_6.csv"),
  YEAR, REGION, DISTRICT, DNAME, GRADE, m_all_docs_n, m_all_d,
  m_all_unsatgl_nm, # BIN1
  m_all_approgl_nm, # BIN2
  m_all_meetsgl_nm, # BIN3
  m_all_mastrgl_nm, # BIN4
  m_all_rs          # average scale score
)
```
Join by district name.

```{r}
# Pair each raw district record with its SEDA row by district name, keep only
# the columns used below, and drop incomplete rows.
# NOTE: later chunks pick the four bin counts as columns 4:7, so the column
# order in select() must stay as written.
mth6 <- rawGrade6 %>%
  inner_join(sedaMth6, by = c("DNAME" = "sedaleaname")) %>%
  select(
    DNAME, m_all_docs_n, m_all_d,
    m_all_unsatgl_nm, # BIN1
    m_all_approgl_nm, # BIN2
    m_all_meetsgl_nm, # BIN3
    m_all_mastrgl_nm, # BIN4
    m_all_rs,         # average scale score
    cs_mn_all,        # mean on y* scale
    cs_mnse_all
  ) %>%
  drop_na() %>%
  # The product is zero as soon as any one bin count is zero, so this keeps
  # only districts with all four bins populated.
  filter(
    m_all_unsatgl_nm * m_all_approgl_nm * m_all_meetsgl_nm * m_all_mastrgl_nm != 0
  )
```

## Correlate SEDA means with actual means

```{r}
# Scatter the SEDA y*-scale mean (cs_mn_all) against the reported average
# scale score (m_all_rs), with a fitted straight line overlaid.
ggplot(mth6, mapping = aes(x = m_all_rs, y = cs_mn_all)) +
  geom_point() +
  # Pass a formula object rather than the string "y~x": geom_smooth()
  # documents `formula` as a formula, and the string only worked because the
  # fitting function coerced it.
  geom_smooth(formula = y ~ x, method = "lm", color = "blue") +
  xlab("Actual reported mean") +
  ylab("HETOP scaled mean")
  
```

Now compute midpoint means:

```{r}
# Five edges delimiting the four bins, in increasing order.
binEdges <- c(1068, 1536, 1653, 1772, 2137)

# Midpoint of each bin: average of its left edge and its right edge.
mp <- (binEdges[-5] + binEdges[-1]) / 2

# Midpoint mean per district: bin midpoints weighted by the bin counts,
# plotted against the reported average scale score.
ggplot(
  mutate(
    mth6,
    mpMean = (mp[1] * m_all_unsatgl_nm + mp[2] * m_all_approgl_nm +
                mp[3] * m_all_meetsgl_nm + mp[4] * m_all_mastrgl_nm) /
      (m_all_unsatgl_nm + m_all_approgl_nm + m_all_meetsgl_nm + m_all_mastrgl_nm)
  ),
  aes(x = m_all_rs, y = mpMean)
) +
  geom_point() +
  coord_fixed(ratio = 1)
```

It looks like the midpoint means underestimate the low means (perhaps because of right skew?).


## Max, Min, and Trapezoid means

```{r}
# Left and right edge of each of the four bins.
lEnd <- head(binEdges, -1)
rEnd <- tail(binEdges, -1)

# minMean puts every count at its bin's left edge and maxMean at its right
# edge; mpMean is the average of those two bounds.
mth6 %>%
  mutate(
    minMean = (lEnd[1] * m_all_unsatgl_nm + lEnd[2] * m_all_approgl_nm +
                 lEnd[3] * m_all_meetsgl_nm + lEnd[4] * m_all_mastrgl_nm) /
      (m_all_unsatgl_nm + m_all_approgl_nm + m_all_meetsgl_nm + m_all_mastrgl_nm),
    maxMean = (rEnd[1] * m_all_unsatgl_nm + rEnd[2] * m_all_approgl_nm +
                 rEnd[3] * m_all_meetsgl_nm + rEnd[4] * m_all_mastrgl_nm) /
      (m_all_unsatgl_nm + m_all_approgl_nm + m_all_meetsgl_nm + m_all_mastrgl_nm),
    mpMean = (minMean + maxMean) / 2
  ) %>%
  ggplot(aes(x = m_all_rs)) +
  # Lower bound (green), upper bound (red), and their average (black).
  geom_point(aes(y = minMean), color = "green") +
  geom_point(aes(y = maxMean), color = "red") +
  geom_point(aes(y = mpMean), color = "black") +
  # Dashed identity line: points on it match the reported mean exactly.
  geom_abline(slope = 1, intercept = 0, linetype = 5) +
  coord_fixed(ratio = 1) +
  xlab("Actual reported mean") +
  ylab("Max/Min possible mean, and Ave")
```

## CDF Spline means

```{r}
library(binsmooth)
# Fit boundedbins() to each district's four bin counts and record the
# estimated mean (est_mean) of every fit in bbMean, one entry per row of mth6.

# Select the bin counts by name rather than by position (4:7), so the loop
# does not silently read the wrong columns if mth6's column order changes.
binCols <- c("m_all_unsatgl_nm", "m_all_approgl_nm",
             "m_all_meetsgl_nm", "m_all_mastrgl_nm")

bbMean <- numeric(nrow(mth6))
# seq_len() yields an empty sequence when mth6 has no rows, whereas
# seq(0) would iterate over c(1, 0) and index rows that do not exist.
for (i in seq_len(nrow(mth6))) {
  bb <- boundedbins(binEdges, as.numeric(mth6[i, binCols]))
  bbMean[i] <- bb$est_mean
}
```

```{r}
# Attach the per-district spline estimates as a column and plot them against
# the reported average scale score.
ggplot(
  add_column(mth6, bbMean = bbMean),
  aes(x = m_all_rs, y = bbMean)
) +
  geom_point() +
  # Dashed red identity line: points on it match the reported mean exactly.
  geom_abline(slope = 1, intercept = 0, linetype = 5, color = "red") +
  coord_fixed(ratio = 1) +
  xlab("Actual reported mean") +
  ylab("CDF splined mean")
```

```{r}
# Signed error of the spline estimate (estimate minus reported mean) plotted
# against the reported mean. bbMean is not a column of mth6 here; mutate()
# picks up the vector of the same name from the enclosing environment.
ggplot(
  mutate(mth6, splineErr = bbMean - m_all_rs),
  aes(x = m_all_rs, y = splineErr)
) +
  geom_point() +
  # NOTE(review): this is the line y = x; on an error-vs-mean plot it
  # presumably falls outside the panel -- confirm whether it is intended.
  geom_abline(slope = 1, intercept = 0) +
  # Dashed red zero-error reference line.
  geom_hline(yintercept = 0, color = "red", linetype = 5) +
  coord_fixed(ratio = 1) +
  xlab("Actual reported mean") +
  ylab("Error: CDF splined mean - actual")
```


```{r}
# Compare the two estimators directly: midpoint mean on the x axis, spline
# estimate on the y axis, one point per district.
ggplot(
  mutate(
    add_column(mth6, bbMean = bbMean),
    mpMean = (mp[1] * m_all_unsatgl_nm + mp[2] * m_all_approgl_nm +
                mp[3] * m_all_meetsgl_nm + mp[4] * m_all_mastrgl_nm) /
      (m_all_unsatgl_nm + m_all_approgl_nm + m_all_meetsgl_nm + m_all_mastrgl_nm)
  ),
  aes(x = mpMean, y = bbMean)
) +
  geom_point()
```


```{r}
# Overlay the fitted density on the binned histogram for a single district
# (row 11 of mth6).

# Pick the four bin counts by name instead of by position (4:7), so this does
# not depend on the column order produced when mth6 was built.
binCounts <- as.numeric(
  mth6[11, c("m_all_unsatgl_nm", "m_all_approgl_nm",
             "m_all_meetsgl_nm", "m_all_mastrgl_nm")]
)

# Histogram density for each bin: count / bin width / total count. The
# leading 0 gives one height per edge (five values) for geom_step().
binHeights <- c(0, binCounts / diff(binEdges)) / sum(binCounts)

bb <- boundedbins(binEdges, binCounts)

tibble(x = binEdges, y = binHeights) %>%
  ggplot(aes(x = x)) +
  # Fitted density from boundedbins() (default color) ...
  geom_function(fun = bb$boundedPDF) +
  # ... against the step histogram (red).
  geom_step(aes(y = y), color = "red", direction = "vh")
```

```{r}
# Estimated mean from the current boundedbins() fit, followed by the
# reported average scale score in row 11 of mth6.
bb$est_mean
mth6[["m_all_rs"]][11]
```

## Summarize all districts in sample

```{r}
# Pool every district in the sample: total count in each of the four bins.
# Assigned with `<-` at the head of the pipeline (rather than a trailing `->`)
# so the name being defined is visible up front.
allBinCounts <- mth6 %>%
  summarize(
    bin1 = sum(m_all_unsatgl_nm),
    bin2 = sum(m_all_approgl_nm),
    bin3 = sum(m_all_meetsgl_nm),
    bin4 = sum(m_all_mastrgl_nm)
  ) %>%
  as.numeric()

# Fit the pooled counts.
bbAll <- boundedbins(binEdges, allBinCounts)

# Histogram density for each bin: count / bin width / total count. The
# leading 0 gives one height per edge (five values) for geom_step().
allBinHeights <- c(0, allBinCounts / diff(binEdges)) / sum(allBinCounts)

tibble(x = binEdges, y = allBinHeights) %>%
  ggplot(aes(x = x)) +
  # Fitted density from boundedbins() against the step histogram (red).
  geom_function(fun = bbAll$boundedPDF) +
  geom_step(aes(y = y), color = "red", direction = "vh")
```
```{r}
# Read the semicolon-delimited file, trimming whitespace around fields and
# not treating doubled quotes as an escaped quote.
math6hist <- read_delim(
  file = "math62018.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)
```