
An Introduction to Generalized Linear Models (4th edition)

Chapter 2 Model Fitting

library(dobson)
library(ggprism)
library(tidyverse)
birthweight
## # A tibble: 12 × 4
##    `boys gestational age` `boys weight` `girls gestational age` `girls weight`
##                     <dbl>         <dbl>                   <dbl>          <dbl>
##  1                     40          2968                      40           3317
##  2                     38          2795                      36           2729
##  3                     40          3163                      40           2935
##  4                     35          2925                      38           2754
##  5                     36          2625                      42           3210
##  6                     37          2847                      39           2817
##  7                     41          3292                      40           3126
##  8                     40          3473                      37           2539
##  9                     37          2628                      36           2412
## 10                     38          3176                      38           2991
## 11                     40          3421                      39           2875
## 12                     38          2975                      40           3231
dim(birthweight)
## [1] 12  4
library(tidyverse)
birthweight |> ggplot(aes(x=`boys gestational age`,
                          y=`boys weight`)) + geom_point(shape=1, size=3) + 
  geom_point(aes(x=`girls gestational age`, y=`girls weight`), shape=19, size=3) +
  theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
) + xlab("Gestational age (weeks)") +
  ylab("Birthweight (grams)")

birthweight |> summarise(boys_sum_x=sum(`boys gestational age`),
                         boys_sum_y=sum(`boys weight`),
                         boys_sum_x2=sum((`boys gestational age`)^2),
                         boys_sum_y2=sum((`boys weight`)^2),
                         boys_sum_xy=sum((`boys gestational age`)*(`boys weight`)),
                         girls_sum_x=sum(`girls gestational age`),
                         girls_sum_y=sum(`girls weight`),
                         girls_sum_x2=sum((`girls gestational age`)^2),
                         girls_sum_y2=sum((`girls weight`)^2),
                         girls_sum_xy=sum((`girls gestational age`)*(`girls weight`))) |> 
  pivot_longer(
    cols = starts_with(c("boys", "girls")), 
    names_to = "items", 
    values_to = "values",
    values_drop_na = TRUE
  )
## # A tibble: 10 × 2
##    items           values
##    <chr>            <dbl>
##  1 boys_sum_x         460
##  2 boys_sum_y       36288
##  3 boys_sum_x2      17672
##  4 boys_sum_y2  110623496
##  5 boys_sum_xy    1395370
##  6 girls_sum_x        465
##  7 girls_sum_y      34936
##  8 girls_sum_x2     18055
##  9 girls_sum_y2 102575468
## 10 girls_sum_xy   1358497
birthweight_summary <- birthweight |> summarise(boys_sum_x=sum(`boys gestational age`),
                         boys_sum_y=sum(`boys weight`),
                         boys_sum_x2=sum((`boys gestational age`)^2),
                         boys_sum_y2=sum((`boys weight`)^2),
                         boys_sum_xy=sum((`boys gestational age`)*(`boys weight`)),
                         girls_sum_x=sum(`girls gestational age`),
                         girls_sum_y=sum(`girls weight`),
                         girls_sum_x2=sum((`girls gestational age`)^2),
                         girls_sum_y2=sum((`girls weight`)^2),
                         girls_sum_xy=sum((`girls gestational age`)*(`girls weight`)))
birthweight_summary
## # A tibble: 1 × 10
##   boys_sum_x boys_sum_y boys_sum_x2 boys_sum_y2 boys_sum_xy girls_sum_x
##        <dbl>      <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
## 1        460      36288       17672   110623496     1395370         465
## # ℹ 4 more variables: girls_sum_y <dbl>, girls_sum_x2 <dbl>,
## #   girls_sum_y2 <dbl>, girls_sum_xy <dbl>
#Table 2.5
K=dim(birthweight)[1]
birthweight_summary |> summarise(b=(K*sum(boys_sum_xy,girls_sum_xy)-(sum(boys_sum_x*boys_sum_y, girls_sum_x*girls_sum_y)))/(K*sum(boys_sum_x2, girls_sum_x2)-(sum(boys_sum_x^2, girls_sum_x^2))),
                                 a1=boys_sum_y/K-b*boys_sum_x/K,
                                 a2=girls_sum_y/K-b*girls_sum_x/K,
                                 b1=(K*boys_sum_xy-boys_sum_x*boys_sum_y)/(K*boys_sum_x2-boys_sum_x^2),
                                 b2=(K*girls_sum_xy-girls_sum_x*girls_sum_y)/(K*girls_sum_x2-girls_sum_x^2),
                                 a11=boys_sum_y/K-b1*boys_sum_x/K,
                                 a22=girls_sum_y/K-b2*girls_sum_x/K,
                                 S0=sum((birthweight$`boys weight`-a1-b*birthweight$`boys gestational age`)^2,
                                        (birthweight$`girls weight`-a2-b*birthweight$`girls gestational age`)^2),
                                 S1=sum((birthweight$`boys weight`-a11-b1*birthweight$`boys gestational age`)^2,
                                        (birthweight$`girls weight`-a22-b2*birthweight$`girls gestational age`)^2),
                                Fstatistic=((S0-S1)/(2-1))/(S1/(24-4)),
                                p_value=pf(Fstatistic, df1=1, df2=20, lower.tail=FALSE)
                                  ) |> 
  as.data.frame()
##          b        a1        a2       b1    b2       a11       a22       S0
## 1 120.8943 -1610.283 -1773.322 111.9828 130.4 -1268.672 -2141.667 658770.7
##         S1 Fstatistic   p_value
## 1 652424.5  0.1945428 0.6638934
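
As a cross-check on the hand calculation, the same F test can be obtained from two nested lm() fits after reshaping the data to long format (a sketch; bw_long, m0 and m1 are names introduced here):

bw_long <- bind_rows(
  tibble(sex = "boy",  age = birthweight$`boys gestational age`,
         weight = birthweight$`boys weight`),
  tibble(sex = "girl", age = birthweight$`girls gestational age`,
         weight = birthweight$`girls weight`))
m0 <- lm(weight ~ sex + age, data = bw_long)     # common slope: S0
m1 <- lm(weight ~ sex + sex:age, data = bw_long) # separate slopes: S1
anova(m0, m1)  # F on (1, 20) df, matching the hand calculation above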

2.5 Exercises

2.5.1

Genetically similar seeds are randomly assigned to be raised in either a nutritionally enriched environment (treatment group) or standard conditions (control group) using a completely randomized experimental design. After a predetermined time, all plants are harvested, dried and weighed. The results, expressed in grams, are shown below.

plants
## # A tibble: 20 × 2
##    treatment control
##        <dbl>   <dbl>
##  1      4.81    4.17
##  2      4.17    3.05
##  3      4.41    5.18
##  4      3.59    4.01
##  5      5.87    6.11
##  6      3.83    4.1 
##  7      6.03    5.17
##  8      4.98    3.57
##  9      4.9     5.33
## 10      5.75    5.59
## 11      5.36    4.66
## 12      3.48    5.58
## 13      4.69    3.66
## 14      4.44    4.5 
## 15      4.89    3.9 
## 16      4.71    4.61
## 17      5.48    5.62
## 18      4.32    4.53
## 19      5.15    6.05
## 20      6.34    5.14

Perform an unpaired t-test on these data and calculate a 95% confidence interval for the difference between the group means. Interpret these results.

#unpaired t-test
t1 <- t.test(plants$treatment, plants$control, var.equal = TRUE)
t1
## 
## 	Two Sample t-test
## 
## data:  plants$treatment and plants$control
## t = 0.50985, df = 38, p-value = 0.6131
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3965733  0.6635733
## sample estimates:
## mean of x mean of y 
##    4.8600    4.7265
t1$conf.int
## [1] -0.3965733  0.6635733
## attr(,"conf.level")
## [1] 0.95
#t
sp <- sqrt((19*var(plants$treatment)+19*var(plants$control))/38)
(mean(plants$treatment)-mean(plants$control))/(sp*sqrt(2/20))
## [1] 0.5098476
#p-value
2*(1-pt(0.5098476, df=38))
## [1] 0.6131068
mean(plants$treatment)
## [1] 4.86
mean(plants$control)
## [1] 4.7265
sum(plants$treatment+plants$control)/40
## [1] 4.79325
mean(c(plants$treatment, plants$control))
## [1] 4.79325
c(plants$treatment, plants$control)
##  [1] 4.81 4.17 4.41 3.59 5.87 3.83 6.03 4.98 4.90 5.75 5.36 3.48 4.69 4.44 4.89
## [16] 4.71 5.48 4.32 5.15 6.34 4.17 3.05 5.18 4.01 6.11 4.10 5.17 3.57 5.33 5.59
## [31] 4.66 5.58 3.66 4.50 3.90 4.61 5.62 4.53 6.05 5.14
#S1 and S0
S1 <- sum((plants$treatment-mean(plants$treatment))^2) + sum((plants$control-mean(plants$control))^2)
S0 <- sum((c(plants$treatment, plants$control)-mean(c(plants$treatment, plants$control)))^2)
S1
## [1] 26.05345
S0
## [1] 26.23168
#F(1,38)
(S0-S1)/(S1/38)
## [1] 0.2599446
t <- (mean(plants$treatment)-mean(plants$control))/(sp*sqrt(2/20))
t^2
## [1] 0.2599446
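
The same F(1, 38) statistic arises from a one-way analysis of variance, confirming F = t^2 (a sketch; plants_long is a name introduced here):

plants_long <- tibble(group = rep(c("treatment", "control"), each = 20),
                      weight = c(plants$treatment, plants$control))
anova(lm(weight ~ group, data = plants_long))  # F should equal t^2 = 0.26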

2.5.2

The weights, in kilograms, of twenty men before and after participation in a “waist loss” program are shown in Table 2.8 (Egger et al. 1999). We want to know if, on average, they retained a weight loss twelve months after the program.

waist
## # A tibble: 20 × 3
##      man before after
##    <dbl>  <dbl> <dbl>
##  1     1   101.  97  
##  2     2   102  108. 
##  3     3   106.  97  
##  4     4   108  108  
##  5     5    92   84  
##  6     6   117. 112. 
##  7     7   110. 102. 
##  8     8   135  128. 
##  9     9   124. 118. 
## 10    10    95   94.2
## 11    11   105  105  
## 12    12    85   82.4
## 13    13   107.  98.2
## 14    14    80   83.6
## 15    15   115. 115  
## 16    16   104. 103  
## 17    17    82   80  
## 18    18   102. 102. 
## 19    19   104. 103. 
## 20    20    93   93
#unpaired t-test
t1 <- t.test(waist$before, waist$after, var.equal = TRUE)
t1
## 
## 	Two Sample t-test
## 
## data:  waist$before and waist$after
## t = 0.64309, df = 38, p-value = 0.524
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.68128 10.97128
## sample estimates:
## mean of x mean of y 
##   103.245   100.600
#Dk
Dk <- waist$before - waist$after
EDk <- mean(Dk)
EDk
## [1] 2.645
#H0 is EDk = 0 and Dk ~ N(0, \sigma^2); H1 is EDk != 0 and Dk ~ N(EDk, \sigma^2)

S0 <- sum(Dk^2)
S1 <- sum((Dk-EDk)^2)
#F(1,19)
(S0-S1)/(S1/19)
## [1] 8.256448
Fstatistic <- (S0-S1)/(S1/19)
1-pf(Fstatistic, df1=1, df2=19)
## [1] 0.009730463
#paired t-test
t2 <- t.test(waist$before, waist$after, paired = TRUE)
t2
## 
## 	Paired t-test
## 
## data:  waist$before and waist$after
## t = 2.8734, df = 19, p-value = 0.00973
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  0.718348 4.571652
## sample estimates:
## mean difference 
##           2.645
(t2$statistic)^2
##        t 
## 8.256448
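
The paired test is equivalent to an intercept-only normal model for the differences Dk; a quick check:

summary(lm(Dk ~ 1))  # t value should equal 2.8734, and t^2 the F(1,19) above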

Chapter 3 Exponential Family and Generalized Linear Models

Exercises

3.6

mortality
## # A tibble: 8 × 3
##   `age group` deaths population
##   <chr>        <dbl>      <dbl>
## 1 30-34            1      17742
## 2 35-39            5      16554
## 3 40-44            5      16059
## 4 45-49           12      13083
## 5 50-54           25      10784
## 6 55-59           38       9645
## 7 60-64           54      10706
## 8 65-69           65       9933
library(dplyr)
mortality2 <- mortality %>%
    mutate(
        age=as.integer(stringr::str_split(as.character(`age group`), "-") %>% purrr::map_chr(., 1)),
        death_rate = deaths*100000 / population,
        log_age=log(age),
        log_deaths=log(death_rate)
    )
mortality2
## # A tibble: 8 × 7
##   `age group` deaths population   age death_rate log_age log_deaths
##   <chr>        <dbl>      <dbl> <int>      <dbl>   <dbl>      <dbl>
## 1 30-34            1      17742    30       5.64    3.40       1.73
## 2 35-39            5      16554    35      30.2     3.56       3.41
## 3 40-44            5      16059    40      31.1     3.69       3.44
## 4 45-49           12      13083    45      91.7     3.81       4.52
## 5 50-54           25      10784    50     232.      3.91       5.45
## 6 55-59           38       9645    55     394.      4.01       5.98
## 7 60-64           54      10706    60     504.      4.09       6.22
## 8 65-69           65       9933    65     654.      4.17       6.48
lmodel <- lm(log_deaths ~ log_age, data = mortality2)
lmodel
## 
## Call:
## lm(formula = log_deaths ~ log_age, data = mortality2)
## 
## Coefficients:
## (Intercept)      log_age  
##     -18.909        6.152
plot(mortality2$log_age, mortality2$log_deaths)

summary(lmodel)
## 
## Call:
## lm(formula = log_deaths ~ log_age, data = mortality2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34631 -0.28614 -0.02311  0.24654  0.44482 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -18.9094     1.7376  -10.88 3.57e-05 ***
## log_age       6.1520     0.4527   13.59 9.85e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3222 on 6 degrees of freedom
## Multiple R-squared:  0.9685,	Adjusted R-squared:  0.9633 
## F-statistic: 184.7 on 1 and 6 DF,  p-value: 9.854e-06
exp(lmodel$fitted.values)*mortality2$population/100000
##         1         2         3         4         5         6         7         8 
##  1.330546  3.204702  7.069199 11.886386 18.733671 30.115732 57.094466 86.677030
mortality2$deaths
## [1]  1  5  5 12 25 38 54 65
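
The counts can also be modelled directly with a Poisson GLM using log(population) as an offset; this formulation is an addition here (a sketch; res.off is a name introduced for it), not part of the exercise as stated:

res.off <- glm(deaths ~ log_age, family = poisson(link = "log"),
               offset = log(population), data = mortality2)
coef(res.off)
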
data(poisson)
poisson
## # A tibble: 9 × 2
##       y     x
##   <dbl> <dbl>
## 1     2    -1
## 2     3    -1
## 3     6     0
## 4     7     0
## 5     8     0
## 6     9     0
## 7    10     1
## 8    12     1
## 9    15     1
res.p=glm(y~x,family=poisson(link="identity"),data=poisson)
summary(res.p)
## 
## Call:
## glm(formula = y ~ x, family = poisson(link = "identity"), data = poisson)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   7.4516     0.8841   8.428  < 2e-16 ***
## x             4.9353     1.0892   4.531 5.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 18.4206  on 8  degrees of freedom
## Residual deviance:  1.8947  on 7  degrees of freedom
## AIC: 40.008
## 
## Number of Fisher Scoring iterations: 3

Chapter 4 Estimation

#table 4.1 
data(failure)
failure
## # A tibble: 49 × 1
##    lifetimes
##        <dbl>
##  1      1051
##  2      1337
##  3      1389
##  4      1921
##  5      1942
##  6      2322
##  7      3629
##  8      4006
##  9      4012
## 10      4063
## 11      4921
## 12      5445
## 13      5620
## 14      5817
## 15      5905
## 16      5956
## 17      6068
## 18      6121
## 19      6473
## 20      7501
## 21      7886
## 22      8108
## 23      8546
## 24      8666
## 25      8831
## # ℹ 24 more rows
hist(failure$lifetimes,freq=T,col="grey",breaks=12)

library(tidyverse)
length(failure$lifetimes)
## [1] 49
#empirical percentiles: rank*100/N (the lifetimes are already sorted, so
#rank() and order() coincide here)
failure$percent <- rank(failure$lifetimes)*100/length(failure$lifetimes)
failure |> ggplot(aes(x=lifetimes, y=percent)) + geom_point(size=3) +
  scale_x_log10() + scale_y_log10() + theme_bw()

#note: with the default scale = 1, dweibull() underflows to 0 over this whole
#range, so `data` is all zeros and `percent` is just an evenly spaced ramp
weibull <- tibble(
  x=seq(1000, 20000, by=1),
  data = dweibull(seq(1000, 20000, by=1),shape=3))
weibull$percent <- order(weibull$data)*100/length(weibull$data)
head(weibull)
## # A tibble: 6 × 3
##       x  data percent
##   <dbl> <dbl>   <dbl>
## 1  1000     0 0.00526
## 2  1001     0 0.0105 
## 3  1002     0 0.0158 
## 4  1003     0 0.0211 
## 5  1004     0 0.0263 
## 6  1005     0 0.0316
weibull |> ggplot(aes(x=x, y=percent)) + geom_point(size=1) + theme_bw()

y <- failure$lifetimes
N <- length(failure$lifetimes)
shape <- 2
#score U(theta) for the Weibull scale parameter theta, with shape fixed at 2
#(scaled by 1e6 for display)
forU <- function(theta){
  1e6*(-(shape*N)/(theta)+(shape*sum(y^2))/(theta^3))
}
#derivative U'(theta)
forUprime <- function(theta){
  1e6*((shape*N)/(theta^2)-(shape*3*sum(y^2))/(theta^4))
}
#expectation E[U'(theta)], i.e. minus the Fisher information
forEUprime <- function(theta){
  1e6*(-(shape^2*N)/(theta^2))
}
#one Newton-Raphson update: theta - U/U'
nextTheta <- function(theta, U, Up){
  theta - U/Up
}
theta1 <- mean(failure$lifetimes)
U10e6_1 <- forU(theta1)
Uprime10e6_1 <- forUprime(theta1)
EUprime10e6_1 <- forEUprime(theta1)
newTheta <- nextTheta(theta1, U10e6_1/1e6, Uprime10e6_1/1e6)
theta1
## [1] 8805.694
U10e6_1
## [1] 2915.751
Uprime10e6_1
## [1] -3.521083
EUprime10e6_1
## [1] -2.52772
U10e6_1/Uprime10e6_1
## [1] -828.0835
U10e6_1/EUprime10e6_1
## [1] -1153.51
newTheta
## [1] 9633.777
library(tidyverse)
tibble("theta" = theta1, 
       "U10e6" = U10e6_1, 
       "Uprime10e6" = Uprime10e6_1, 
       "EUprime10e6" = EUprime10e6_1, 
       "U/Uprime" = U10e6_1/Uprime10e6_1, 
       "U/EUprime" = U10e6_1/EUprime10e6_1
       ) %>% 
  add_row("theta" = nextTheta(last(.$theta), last(.$U10e6)/1e6, last(.$Uprime10e6)/1e6),
          "U10e6" = forU(theta), 
       "Uprime10e6" = forUprime(theta), 
       "EUprime10e6" = forEUprime(theta), 
       "U/Uprime" = U10e6/Uprime10e6, 
       "U/EUprime" = U10e6/EUprime10e6
       ) %>% 
  add_row("theta" = nextTheta(last(.$theta), last(.$U10e6)/1e6, last(.$Uprime10e6)/1e6),
          "U10e6" = forU(theta), 
       "Uprime10e6" = forUprime(theta), 
       "EUprime10e6" = forEUprime(theta), 
       "U/Uprime" = U10e6/Uprime10e6, 
       "U/EUprime" = U10e6/EUprime10e6
       ) %>% 
  add_row("theta" = nextTheta(last(.$theta), last(.$U10e6)/1e6, last(.$Uprime10e6)/1e6),
          "U10e6" = forU(theta), 
       "Uprime10e6" = forUprime(theta), 
       "EUprime10e6" = forEUprime(theta), 
       "U/Uprime" = U10e6/Uprime10e6, 
       "U/EUprime" = U10e6/EUprime10e6
       ) %>% 
  add_row("theta" = nextTheta(last(.$theta), last(.$U10e6)/1e6, last(.$Uprime10e6)/1e6),
          "U10e6" = forU(theta), 
       "Uprime10e6" = forUprime(theta), 
       "EUprime10e6" = forEUprime(theta), 
       "U/Uprime" = U10e6/Uprime10e6, 
       "U/EUprime" = U10e6/EUprime10e6
       )
## # A tibble: 5 × 6
##   theta         U10e6 Uprime10e6 EUprime10e6    `U/Uprime` `U/EUprime`
##   <dbl>         <dbl>      <dbl>       <dbl>         <dbl>       <dbl>
## 1 8806. 2916.              -3.52       -2.53 -828.            -1.15e+3
## 2 9634.  553.              -2.28       -2.11 -242.            -2.62e+2
## 3 9876.   32.7             -2.02       -2.01  -16.2           -1.63e+1
## 4 9892.    0.134           -2.00       -2.00   -0.0668        -6.68e-2
## 5 9892.    0.00000226      -2.00       -2.00   -0.00000113    -1.13e-6
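
The four copy-pasted add_row() calls above can be collapsed into a loop; a minimal sketch that reproduces the theta column (the 1e6 scaling cancels in the Newton-Raphson ratio):

theta <- mean(failure$lifetimes)
for (i in 1:5) {
  cat(sprintf("iteration %d: theta = %.3f\n", i, theta))
  theta <- theta - forU(theta)/forUprime(theta)  # Newton-Raphson step
}
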
library(dobson)
data(poisson)
poisson
## # A tibble: 9 × 2
##       y     x
##   <dbl> <dbl>
## 1     2    -1
## 2     3    -1
## 3     6     0
## 4     7     0
## 5     8     0
## 6     9     0
## 7    10     1
## 8    12     1
## 9    15     1
res.p=glm(y~x,family=poisson(link="identity"),data=poisson)
summary(res.p)
## 
## Call:
## glm(formula = y ~ x, family = poisson(link = "identity"), data = poisson)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   7.4516     0.8841   8.428  < 2e-16 ***
## x             4.9353     1.0892   4.531 5.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 18.4206  on 8  degrees of freedom
## Residual deviance:  1.8947  on 7  degrees of freedom
## AIC: 40.008
## 
## Number of Fisher Scoring iterations: 3
aids
## # A tibble: 20 × 3
##     year quarter cases
##    <dbl>   <dbl> <dbl>
##  1  1984       1     1
##  2  1984       2     6
##  3  1984       3    16
##  4  1984       4    23
##  5  1985       1    27
##  6  1985       2    39
##  7  1985       3    31
##  8  1985       4    30
##  9  1986       1    43
## 10  1986       2    51
## 11  1986       3    63
## 12  1986       4    70
## 13  1987       1    88
## 14  1987       2    97
## 15  1987       3    91
## 16  1987       4   104
## 17  1988       1   110
## 18  1988       2   113
## 19  1988       3   149
## 20  1988       4   159
aids |> ggplot(aes(x=seq(1,20, by=1),
                   y=cases)) + geom_point(size=2) + theme_bw()

ggplot(aes(x=log(seq(1,20, by=1)), 
           y=log(cases)),
           data=aids) + geom_point(size=2) + theme_bw()

aids$logx <- log(seq(1,20, by=1))
res.p=glm(cases~logx,family=poisson(link="log"),data=aids)
summary(res.p)
## 
## Call:
## glm(formula = cases ~ logx, family = poisson(link = "log"), data = aids)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.99600    0.16971   5.869 4.39e-09 ***
## logx         1.32661    0.06463  20.525  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 677.264  on 19  degrees of freedom
## Residual deviance:  21.755  on 18  degrees of freedom
## AIC: 138.05
## 
## Number of Fisher Scoring iterations: 4
beta1 = 1
beta2 = 1
beta = matrix(c(beta1, beta2))
beta
##      [,1]
## [1,]    1
## [2,]    1
X = matrix(c(rep(1,20), aids$logx),
           ncol = 2,
           byrow = FALSE)
X
##       [,1]      [,2]
##  [1,]    1 0.0000000
##  [2,]    1 0.6931472
##  [3,]    1 1.0986123
##  [4,]    1 1.3862944
##  [5,]    1 1.6094379
##  [6,]    1 1.7917595
##  [7,]    1 1.9459101
##  [8,]    1 2.0794415
##  [9,]    1 2.1972246
## [10,]    1 2.3025851
## [11,]    1 2.3978953
## [12,]    1 2.4849066
## [13,]    1 2.5649494
## [14,]    1 2.6390573
## [15,]    1 2.7080502
## [16,]    1 2.7725887
## [17,]    1 2.8332133
## [18,]    1 2.8903718
## [19,]    1 2.9444390
## [20,]    1 2.9957323
library(matlib)
Y=aids$cases
#log link: mu = exp(eta), and the IRLS weights are w_i = mu_i
W = as.vector(exp(X %*% beta))
XWX = t(X) %*% diag(W) %*% X
XWX
##           [,1]     [,2]
## [1,]  570.8392 1439.608
## [2,] 1439.6080 3768.835
#working variate z = eta + (y - mu)/mu
Z = X %*% beta + Y/exp(X %*% beta) - 1
XWz = t(X) %*% diag(W) %*% Z
XWz
##          [,1]
## [1,] 2750.608
## [2,] 7165.214
beta = inv(XWX) %*% XWz
beta
##          [,1]
## [1,] 0.652354
## [2,] 1.651976
W = as.vector(exp(X %*% beta))
XWX = t(X) %*% diag(W) %*% X
XWX
##          [,1]      [,2]
## [1,] 2179.160  5759.537
## [2,] 5759.537 15530.924
Z = X %*% beta + Y/exp(X %*% beta) - 1
XWz = t(X) %*% diag(W) %*% Z
XWz
##          [,1]
## [1,] 10068.04
## [2,] 27050.81
beta = inv(XWX) %*% XWz
beta
##           [,1]
## [1,] 0.8418285
## [2,] 1.4296516
W = as.vector(exp(X %*% beta))
XWX = t(X) %*% diag(W) %*% X
XWX
##          [,1]      [,2]
## [1,] 1468.897  3831.947
## [2,] 3831.947 10243.200
Z = X %*% beta + Y/exp(X %*% beta) - 1
XWz = t(X) %*% diag(W) %*% Z
XWz
##           [,1]
## [1,]  6557.012
## [2,] 17434.482
beta = inv(XWX) %*% XWz
beta
##          [,1]
## [1,] 0.984507
## [2,] 1.333712
W = as.vector(exp(X %*% beta))
XWX = t(X) %*% diag(W) %*% X
XWX
##          [,1]     [,2]
## [1,] 1320.094 3421.644
## [2,] 3421.644 9108.435
Z = X %*% beta + Y/exp(X %*% beta) - 1
XWz = t(X) %*% diag(W) %*% Z
XWz
##           [,1]
## [1,]  5854.034
## [2,] 15491.392
beta = inv(XWX) %*% XWz
beta
##           [,1]
## [1,] 0.9959001
## [2,] 1.3266773
W = as.vector(exp(X %*% beta))
XWX = t(X) %*% diag(W) %*% X
XWX
##          [,1]     [,2]
## [1,] 1311.101 3396.658
## [2,] 3396.658 9039.070
Z = X %*% beta + Y/exp(X %*% beta) - 1
XWz = t(X) %*% diag(W) %*% Z
XWz
##           [,1]
## [1,]  5811.893
## [2,] 15374.382
beta = inv(XWX) %*% XWz
beta
##           [,1]
## [1,] 0.9959791
## [2,] 1.3265970
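
The five manual IRLS cycles can likewise be wrapped in a loop; a minimal sketch, using solve() in place of matlib::inv():

beta <- matrix(c(1, 1))
for (i in 1:5) {
  eta <- X %*% beta
  W <- as.vector(exp(eta))   # Poisson with log link: w_i = mu_i
  Z <- eta + Y/exp(eta) - 1  # working variate z = eta + (y - mu)/mu
  beta <- solve(t(X) %*% diag(W) %*% X, t(X) %*% diag(W) %*% Z)
}
beta  # should agree with coef(res.p) from the log-link fit above
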
leukemia
## # A tibble: 17 × 2
##     time   wbc
##    <dbl> <dbl>
##  1    65  3.36
##  2   156  2.88
##  3   100  3.63
##  4   134  3.41
##  5    16  3.78
##  6   108  4.02
##  7   121  4   
##  8     4  4.23
##  9    39  3.73
## 10   143  3.85
## 11    56  3.97
## 12    26  4.51
## 13    22  4.54
## 14     1  5   
## 15     1  5   
## 16     5  4.72
## 17    65  5
leukemia |> ggplot(aes(x=wbc, y=time))+geom_point(size=2) + theme_bw()

res.e=glm(time~wbc,family=Gamma(link="log"), data=leukemia)
summary(res.e, dispersion=1)
## 
## Call:
## glm(formula = time ~ wbc, family = Gamma(link = "log"), data = leukemia)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   8.4775     1.6548   5.123 3.01e-07 ***
## wbc          -1.1093     0.3997  -2.776  0.00551 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 1)
## 
##     Null deviance: 26.282  on 16  degrees of freedom
## Residual deviance: 19.457  on 15  degrees of freedom
## AIC: 173.97
## 
## Number of Fisher Scoring iterations: 8
beta1 = 1
beta2 = 1
beta = matrix(c(beta1, beta2))
beta
##      [,1]
## [1,]    1
## [2,]    1
X = matrix(c(rep(1,length(leukemia$wbc)), leukemia$wbc),
           ncol = 2,
           byrow = FALSE)
X
##       [,1] [,2]
##  [1,]    1 3.36
##  [2,]    1 2.88
##  [3,]    1 3.63
##  [4,]    1 3.41
##  [5,]    1 3.78
##  [6,]    1 4.02
##  [7,]    1 4.00
##  [8,]    1 4.23
##  [9,]    1 3.73
## [10,]    1 3.85
## [11,]    1 3.97
## [12,]    1 4.51
## [13,]    1 4.54
## [14,]    1 5.00
## [15,]    1 5.00
## [16,]    1 4.72
## [17,]    1 5.00
library(matlib)
Y=leukemia$time

for (i in 1:8) {
  #exponential (gamma, phi = 1) with log link: the IRLS weights are constant, w_i = 1
  W = rep(1,length(leukemia$wbc))
  XWX = t(X) %*% diag(W) %*% X
  print(paste0("iter_", i))
  print(XWX)
  #working variate z = eta + (y - mu)/mu
  Z = X %*% beta + (Y-exp(X %*% beta))/exp(X %*% beta)
  XWz = t(X) %*% diag(W) %*% Z
  print(XWz)
  beta = inv(XWX) %*% XWz
  print(beta)
  writeLines("\n")
}
## [1] "iter_1"
##       [,1]     [,2]
## [1,] 17.00  69.6300
## [2,] 69.63 291.4571
##           [,1]
## [1,]  80.19729
## [2,] 328.25994
##             [,1]
## [1,]  4.86057632
## [2,] -0.03493351
## 
## 
## [1] "iter_2"
##       [,1]     [,2]
## [1,] 17.00  69.6300
## [2,] 69.63 291.4571
##           [,1]
## [1,]  72.57509
## [2,] 293.84440
##            [,1]
## [1,]  6.5030112
## [2,] -0.5453974
## 
## 
## [1] "iter_3"
##       [,1]     [,2]
## [1,] 17.00  69.6300
## [2,] 69.63 291.4571
##           [,1]
## [1,]  68.40056
## [2,] 274.35080
##            [,1]
## [1,]  7.8244252
## [2,] -0.9279706
## 
## 
## [1] "iter_4"
##       [,1]     [,2]
## [1,] 17.00  69.6300
## [2,] 69.63 291.4571
##           [,1]
## [1,]  67.03087
## [2,] 267.77587
##           [,1]
## [1,]  8.375058
## [2,] -1.082077
## 
## 
## [1] "iter_5"
##       [,1]     [,2]
## [1,] 17.00  69.6300
## [2,] 69.63 291.4571
##           [,1]
## [1,]  66.87989
## [2,] 266.99995
##           [,1]
## [1,]  8.469218
## [2,] -1.107235
## 
## 
## [1] "iter_6"
##       [,1]     [,2]
## [1,] 17.00  69.6300
## [2,] 69.63 291.4571
##           [,1]
## [1,]  66.87706
## [2,] 266.97632
##           [,1]
## [1,]  8.476937
## [2,] -1.109160
## 
## 
## [1] "iter_7"
##       [,1]     [,2]
## [1,] 17.00  69.6300
## [2,] 69.63 291.4571
##           [,1]
## [1,]  66.87705
## [2,] 266.97547
##           [,1]
## [1,]  8.477461
## [2,] -1.109288
## 
## 
## [1] "iter_8"
##       [,1]     [,2]
## [1,] 17.00  69.6300
## [2,] 69.63 291.4571
##           [,1]
## [1,]  66.87705
## [2,] 266.97541
##           [,1]
## [1,]  8.477495
## [2,] -1.109296
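
The converged values agree with the earlier glm() fit; a quick check:

coef(res.e)  # (Intercept) ~ 8.4775, wbc ~ -1.1093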

Chapter 5 Inference

#Exercises 5.1
#critical value
qchisq(0.95,df=1)
## [1] 3.841459
N=10
y=3
pi=c(0.1,0.3,0.5)
#information
J=N/(pi*(1-pi))
#score statistic (y - N*pi)^2 / (N*pi*(1-pi)), evaluated at the hypothesised pi
Wald = (y-N*pi)^2/(N*pi*(1-pi))
#Deviance
pi_hat=y/N
D=2*(y*log(pi_hat/pi)+(N-y)*log((1-pi_hat)/(1-pi)))
Wald
## [1] 4.444444 0.000000 1.600000
D
## [1] 3.073272 0.000000 1.645658
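
Comparing each statistic with the critical value makes the conclusion explicit: only the score statistic at pi = 0.1 exceeds 3.84, while the deviance rejects none of the three values.

Wald > qchisq(0.95, df=1)
D > qchisq(0.95, df=1)
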
#Exercises 5.3(d)
N=100
theta_hat=c()
for(i in 1:20){
  U = runif(N, min = 0, max = 1)
  theta=2
  Y=(1/U)^(1/theta)
  theta_h = N/(sum(log(Y)))
  print(theta_h)
  theta_hat=c(theta_hat, theta_h)
}
## [1] 2.191289
## [1] 1.973575
## [1] 2.269228
## [1] 1.936563
## [1] 1.858015
## [1] 1.794424
## [1] 2.046288
## [1] 1.925291
## [1] 2.057207
## [1] 2.514523
## [1] 2.192137
## [1] 1.958794
## [1] 2.131131
## [1] 1.581678
## [1] 1.819127
## [1] 1.915695
## [1] 1.902304
## [1] 1.735085
## [1] 2.007618
## [1] 2.057621
theta_hat
##  [1] 2.191289 1.973575 2.269228 1.936563 1.858015 1.794424 2.046288 1.925291
##  [9] 2.057207 2.514523 2.192137 1.958794 2.131131 1.581678 1.819127 1.915695
## [17] 1.902304 1.735085 2.007618 2.057621
quantile(theta_hat,  probs = c(2.5, 97.5)/100)
##     2.5%    97.5% 
## 1.654546 2.398008
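
These estimates come from unseeded random draws, so the printed values change from run to run. For a reproducible version, set a seed first (a sketch; the seed value is arbitrary and theta_hat2 is a name introduced here):

set.seed(2024)  # arbitrary seed, for reproducibility
theta_hat2 <- replicate(20, {
  y <- (1/runif(100))^(1/2)  # theta = 2, N = 100, as above
  100/sum(log(y))
})
quantile(theta_hat2, probs = c(0.025, 0.975))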
library(dobson)
leukemia
## # A tibble: 17 × 2
##     time   wbc
##    <dbl> <dbl>
##  1    65  3.36
##  2   156  2.88
##  3   100  3.63
##  4   134  3.41
##  5    16  3.78
##  6   108  4.02
##  7   121  4   
##  8     4  4.23
##  9    39  3.73
## 10   143  3.85
## 11    56  3.97
## 12    26  4.51
## 13    22  4.54
## 14     1  5   
## 15     1  5   
## 16     5  4.72
## 17    65  5
res.e=glm(time~wbc,family=Gamma(link="log"), data=leukemia)
summary(res.e, dispersion=1)
## 
## Call:
## glm(formula = time ~ wbc, family = Gamma(link = "log"), data = leukemia)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   8.4775     1.6548   5.123 3.01e-07 ***
## wbc          -1.1093     0.3997  -2.776  0.00551 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 1)
## 
##     Null deviance: 26.282  on 16  degrees of freedom
## Residual deviance: 19.457  on 15  degrees of freedom
## AIC: 173.97
## 
## Number of Fisher Scoring iterations: 8
#approximate 95% confidence interval for b1: estimate +/- 2 standard errors
-1.1093+0.3997*2
## [1] -0.3099
-1.1093-0.3997*2
## [1] -1.9087
res.e2=glm(time~1,family=Gamma(link="log"), data=leukemia)
summary(res.e2, dispersion=1)
## 
## Call:
## glm(formula = time ~ 1, family = Gamma(link = "log"), data = leukemia)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   4.1347     0.2425   17.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 1)
## 
##     Null deviance: 26.282  on 16  degrees of freedom
## Residual deviance: 26.282  on 16  degrees of freedom
## AIC: 178.09
## 
## Number of Fisher Scoring iterations: 6
#deviance difference
res.e2$deviance - res.e$deviance
## [1] 6.825567
diff = res.e2$deviance - res.e$deviance
#p value
(1-pchisq(diff, df=1))
## [1] 0.008986204
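
The same comparison is available from anova(); a quick cross-check (the dispersion argument fixes the dispersion at 1, matching the hand calculation above):

anova(res.e2, res.e, test = "Chisq", dispersion = 1)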

Chapter 6 Normal Linear Models

carbohydrate
## # A tibble: 20 × 4
##    carbohydrate   age weight protein
##           <dbl> <dbl>  <dbl>   <dbl>
##  1           33    33    100      14
##  2           40    47     92      15
##  3           37    49    135      18
##  4           27    35    144      12
##  5           30    46    140      15
##  6           43    52    101      15
##  7           34    62     95      14
##  8           48    23    101      17
##  9           30    32     98      15
## 10           38    42    105      14
## 11           50    31    108      17
## 12           51    61     85      19
## 13           30    63    130      19
## 14           36    40    127      20
## 15           41    50    109      15
## 16           42    64    107      16
## 17           46    56    117      18
## 18           24    61    100      13
## 19           35    48    118      18
## 20           37    28    102      14
dput(carbohydrate)
## structure(list(carbohydrate = c(33, 40, 37, 27, 30, 43, 34, 48, 
## 30, 38, 50, 51, 30, 36, 41, 42, 46, 24, 35, 37), age = c(33, 
## 47, 49, 35, 46, 52, 62, 23, 32, 42, 31, 61, 63, 40, 50, 64, 56, 
## 61, 48, 28), weight = c(100, 92, 135, 144, 140, 101, 95, 101, 
## 98, 105, 108, 85, 130, 127, 109, 107, 117, 100, 118, 102), protein = c(14, 
## 15, 18, 12, 15, 15, 14, 17, 15, 14, 17, 19, 19, 20, 15, 16, 18, 
## 13, 18, 14)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
## -20L))
#Model 6.6 (the full model: age, weight and protein)
N=dim(carbohydrate)[1]
X <- carbohydrate
X[,1] <- rep(1, N)
X
## # A tibble: 20 × 4
##    carbohydrate   age weight protein
##           <dbl> <dbl>  <dbl>   <dbl>
##  1            1    33    100      14
##  2            1    47     92      15
##  3            1    49    135      18
##  4            1    35    144      12
##  5            1    46    140      15
##  6            1    52    101      15
##  7            1    62     95      14
##  8            1    23    101      17
##  9            1    32     98      15
## 10            1    42    105      14
## 11            1    31    108      17
## 12            1    61     85      19
## 13            1    63    130      19
## 14            1    40    127      20
## 15            1    50    109      15
## 16            1    64    107      16
## 17            1    56    117      18
## 18            1    61    100      13
## 19            1    48    118      18
## 20            1    28    102      14

Why does \((y-Xb)^T(y-Xb)=y^Ty-b^TX^Ty\), where \(b=(X^TX)^{-1}X^Ty\) is the least squares estimate?

\[\begin{align} (y-Xb)^T(y-Xb)&=y^Ty-y^TXb-b^TX^Ty+b^TX^TXb\\ &=y^Ty-2b^TX^Ty+b^TX^Ty\\ &=y^Ty-b^TX^Ty \end{align}\]

The middle step uses the fact that \(y^TXb\) is a scalar, so \(y^TXb=(y^TXb)^T=b^TX^Ty\), together with the normal equations \(X^TXb=X^Ty\), which give \(b^TX^TXb=b^TX^Ty\).

carbohydrate <- structure(list(carbohydrate = c(33, 40, 37, 27, 30, 43, 34, 48, 
30, 38, 50, 51, 30, 36, 41, 42, 46, 24, 35, 37), age = c(33, 
47, 49, 35, 46, 52, 62, 23, 32, 42, 31, 61, 63, 40, 50, 64, 56, 
61, 48, 28), weight = c(100, 92, 135, 144, 140, 101, 95, 101, 
98, 105, 108, 85, 130, 127, 109, 107, 117, 100, 118, 102), protein = c(14, 
15, 18, 12, 15, 15, 14, 17, 15, 14, 17, 19, 19, 20, 15, 16, 18, 
13, 18, 14)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-20L))

N=dim(carbohydrate)[1]
X <- carbohydrate
X[,1] <- rep(1, N)
y <- as.matrix(carbohydrate$carbohydrate)
#b=(X^TX)^{-1}X^Ty
X <- as.matrix(X)
XTX <- t(X) %*% X
XTy <- t(X) %*% y
XTX_inv <- solve(XTX)
b=XTX_inv %*% XTy

#Residual S
t(y - X %*% b) %*% (y - X %*% b)
##          [,1]
## [1,] 567.6629
#Residual S
t(y) %*% y - 2*(t(y) %*% X %*% b) + t(b) %*% XTX %*% b
##          [,1]
## [1,] 567.6629
#Residual S
t(y) %*% y - t(b) %*% XTy
##          [,1]
## [1,] 567.6629
#Model 6.6 (full model)
y <- as.matrix(carbohydrate$carbohydrate)
#X^Ty
t(as.matrix(X)) %*% y
##               [,1]
## carbohydrate   752
## age          34596
## weight       82270
## protein      12105
#X^TX
t(as.matrix(X)) %*% as.matrix(X)
##              carbohydrate    age weight protein
## carbohydrate           20    923   2214     318
## age                   923  45697 102003   14780
## weight               2214 102003 250346   35306
## protein               318  14780  35306    5150
#b=(X^TX)^{-1}X^Ty
X <- as.matrix(X)
XTX <- t(X) %*% X
XTy <- t(X) %*% y
solve(XTX)
##              carbohydrate           age        weight       protein
## carbohydrate   4.81577219 -1.134935e-02 -1.875498e-02 -0.1362153330
## age           -0.01134935  3.368777e-04  1.887239e-05 -0.0003953918
## weight        -0.01875498  1.887239e-05  1.955259e-04 -0.0002365220
## protein       -0.13621533 -3.953918e-04 -2.365220e-04  0.0113613619
XTX_inv <- solve(XTX)
b=solve(XTX) %*% XTy
b
##                    [,1]
## carbohydrate 36.9600559
## age          -0.1136764
## weight       -0.2280174
## protein       1.9577126
t(y) %*% y
##       [,1]
## [1,] 29368
#Residual S
t(y - X %*% b) %*% (y - X %*% b)
##          [,1]
## [1,] 567.6629
#Residual S
t(y) %*% y - 2*(t(y) %*% X %*% b) + t(b) %*% XTX %*% b
##          [,1]
## [1,] 567.6629
#Residual
t(y) %*% y - t(b) %*% XTy
##          [,1]
## [1,] 567.6629
#R^2
(t(b) %*% XTX %*% b - N*mean(y)^2)/(t(y) %*% y - N*(mean(y)^2))
##           [,1]
## [1,] 0.4805428
(t(b) %*% XTy - N*mean(y)^2)/(t(y) %*% y - N*mean(y)^2)
##           [,1]
## [1,] 0.4805428
#fitted values
X %*% b
##           [,1]
##  [1,] 37.81498
##  [2,] 40.00536
##  [3,] 35.84640
##  [4,] 23.63943
##  [5,] 29.17420
##  [6,] 37.38482
##  [7,] 35.65845
##  [8,] 44.59686
##  [9,] 40.34240
## [10,] 35.65180
## [11,] 42.09133
## [12,] 47.84086
## [13,] 37.35273
## [14,] 42.60905
## [15,] 35.78803
## [16,] 36.61031
## [17,] 39.15497
## [18,] 32.67433
## [19,] 39.83637
## [20,] 37.92732
#unbiased estimator of \sigma^2:
p=4
(t(y-X %*% b) %*% (y-X %*% b))/(N-p)
##          [,1]
## [1,] 35.47893
sigma_sq <- (t(y-X %*% b) %*% (y-X %*% b))/(N-p)
#diagonal of (X^TX)^{-1}: multiplied by sigma^2 it gives J^{-1}, the variances of b
diag(XTX_inv)
## carbohydrate          age       weight      protein 
## 4.8157721870 0.0003368777 0.0001955259 0.0113613619
sqrt(as.vector(sigma_sq) * diag(XTX_inv))
## carbohydrate          age       weight      protein 
##  13.07128293   0.10932548   0.08328895   0.63489286
#Model 6.7 (the reduced model, without age)
N=dim(carbohydrate)[1]
X2 <- carbohydrate[,2:4]
X2[,1] <- rep(1, N)
X2
## # A tibble: 20 × 3
##      age weight protein
##    <dbl>  <dbl>   <dbl>
##  1     1    100      14
##  2     1     92      15
##  3     1    135      18
##  4     1    144      12
##  5     1    140      15
##  6     1    101      15
##  7     1     95      14
##  8     1    101      17
##  9     1     98      15
## 10     1    105      14
## 11     1    108      17
## 12     1     85      19
## 13     1    130      19
## 14     1    127      20
## 15     1    109      15
## 16     1    107      16
## 17     1    117      18
## 18     1    100      13
## 19     1    118      18
## 20     1    102      14
y <- as.matrix(carbohydrate$carbohydrate)
#b=(X^TX)^{-1}X^Ty
X2 <- as.matrix(X2)
XTX2 <- t(X2) %*% X2
XTy2 <- t(X2) %*% y
solve(XTX2)
##                 age        weight       protein
## age      4.43341460 -0.0181191691 -0.1495360119
## weight  -0.01811917  0.0001944686 -0.0002143716
## protein -0.14953601 -0.0002143716  0.0108972924
XTX_inv2 <- solve(XTX2)
b2=XTX_inv2 %*% XTy2
b2
##              [,1]
## age     33.130320
## weight  -0.221649
## protein  1.824291
t(y) %*% y
##       [,1]
## [1,] 29368
#Residual
t(y - X2 %*% b2) %*% (y - X2 %*% b2)
##          [,1]
## [1,] 606.0219
res.lm=lm(carbohydrate~age+weight+protein,data=carbohydrate)
summary(res.lm)
## 
## Call:
## lm(formula = carbohydrate ~ age + weight + protein, data = carbohydrate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3424  -4.8203   0.9897   3.8553   7.9087 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 36.96006   13.07128   2.828  0.01213 * 
## age         -0.11368    0.10933  -1.040  0.31389   
## weight      -0.22802    0.08329  -2.738  0.01460 * 
## protein      1.95771    0.63489   3.084  0.00712 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.956 on 16 degrees of freedom
## Multiple R-squared:  0.4805,	Adjusted R-squared:  0.3831 
## F-statistic: 4.934 on 3 and 16 DF,  p-value: 0.01297
res.lm=lm(carbohydrate~age+weight+protein,data=carbohydrate)
b2 <- res.lm$coefficients
N=dim(carbohydrate)[1]
X <- carbohydrate
X[,1] <- rep(1, N)
y <- as.matrix(carbohydrate$carbohydrate)
#b=(X^TX)^{-1}X^Ty
X <- as.matrix(X)

#Residual S
t(y - X %*% b2) %*% (y - X %*% b2)
##          [,1]
## [1,] 567.6629
#Residual S
t(y) %*% y - 2*(t(y) %*% X %*% b2) + t(b2) %*% XTX %*% b2
##          [,1]
## [1,] 567.6629
#Residual
t(y) %*% y - t(b2) %*% XTy
##          [,1]
## [1,] 567.6629
crossprod(X)
##              carbohydrate    age weight protein
## carbohydrate           20    923   2214     318
## age                   923  45697 102003   14780
## weight               2214 102003 250346   35306
## protein               318  14780  35306    5150
t(X) %*% X
##              carbohydrate    age weight protein
## carbohydrate           20    923   2214     318
## age                   923  45697 102003   14780
## weight               2214 102003 250346   35306
## protein               318  14780  35306    5150
res.lm$fitted.values
##        1        2        3        4        5        6        7        8 
## 37.81498 40.00536 35.84640 23.63943 29.17420 37.38482 35.65845 44.59686 
##        9       10       11       12       13       14       15       16 
## 40.34240 35.65180 42.09133 47.84086 37.35273 42.60905 35.78803 36.61031 
##       17       18       19       20 
## 39.15497 32.67433 39.83637 37.92732
#unbiased estimator of \sigma^2:
b2 <- res.lm$coefficients
p=4
sigma_sq <- (t(y-X %*% b2) %*% (y-X %*% b2))/(N-p)
sigma_hat <- sqrt(sigma_sq)
sigma_hat
##          [,1]
## [1,] 5.956419
#hat matrix H = X(X^TX)^{-1}X^T
H <- X %*% XTX_inv %*% t(X)
diag(H)
##  [1] 0.14759018 0.11986321 0.19203771 0.49453706 0.23926883 0.08702051
##  [7] 0.22415190 0.29634508 0.14949497 0.09270349 0.15862094 0.34949329
## [13] 0.27032727 0.29021767 0.06653041 0.15639780 0.12027527 0.25560301
## [19] 0.10186193 0.18765947
hatvalues(res.lm)
##          1          2          3          4          5          6          7 
## 0.14759018 0.11986321 0.19203771 0.49453706 0.23926883 0.08702051 0.22415190 
##          8          9         10         11         12         13         14 
## 0.29634508 0.14949497 0.09270349 0.15862094 0.34949329 0.27032727 0.29021767 
##         15         16         17         18         19         20 
## 0.06653041 0.15639780 0.12027527 0.25560301 0.10186193 0.18765947
#standard deviation of each residual: sigma_hat*sqrt(1 - h_kk)
std_e <- as.vector(sigma_hat)*sqrt(1-hatvalues(res.lm))
std_e
##        1        2        3        4        5        6        7        8 
## 5.499326 5.588051 5.354030 4.234771 5.195183 5.691356 5.246547 4.996491 
##        9       10       11       12       13       14       15       16 
## 5.493178 5.673615 5.463628 4.804090 5.088026 5.018199 5.754868 5.470841 
##       17       18       19       20 
## 5.586742 5.139106 5.644907 5.368517
#Table 6.6
library(tidyverse)
tibble(carbohydrate = carbohydrate$carbohydrate, 
       Fitted_values = res.lm$fitted.values, 
       Residual = res.lm$residuals, 
       Std_residual = res.lm$residuals/std_e, 
       DFIT = Std_residual*sqrt(hatvalues(res.lm)/(1-hatvalues(res.lm))),
       cooks_distance = cooks.distance(res.lm))
## # A tibble: 20 × 6
##    carbohydrate Fitted_values  Residual Std_residual      DFIT cooks_distance
##           <dbl>         <dbl>     <dbl>        <dbl>     <dbl>          <dbl>
##  1           33          37.8  -4.81       -0.876    -0.364      0.0332      
##  2           40          40.0  -0.00536    -0.000959 -0.000354   0.0000000313
##  3           37          35.8   1.15        0.215     0.105      0.00276     
##  4           27          23.6   3.36        0.794     0.785      0.154       
##  5           30          29.2   0.826       0.159     0.0891     0.00199     
##  6           43          37.4   5.62        0.987     0.305      0.0232      
##  7           34          35.7  -1.66       -0.316    -0.170      0.00722     
##  8           48          44.6   3.40        0.681     0.442      0.0488      
##  9           30          40.3 -10.3        -1.88     -0.789      0.156       
## 10           38          35.7   2.35        0.414     0.132      0.00438     
## 11           50          42.1   7.91        1.45      0.629      0.0988      
## 12           51          47.8   3.16        0.658     0.482      0.0581      
## 13           30          37.4  -7.35       -1.45     -0.880      0.193       
## 14           36          42.6  -6.61       -1.32     -0.842      0.177       
## 15           41          35.8   5.21        0.906     0.242      0.0146      
## 16           42          36.6   5.39        0.985     0.424      0.0450      
## 17           46          39.2   6.85        1.23      0.453      0.0513      
## 18           24          32.7  -8.67       -1.69     -0.989      0.245       
## 19           35          39.8  -4.84       -0.857    -0.289      0.0208      
## 20           37          37.9  -0.927      -0.173    -0.0830     0.00172
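
R also provides these diagnostics directly, giving a quick cross-check (note that stats::dffits() uses externally studentized residuals, so it differs slightly from the DFIT column computed here):

rstandard(res.lm)       # internally studentized residuals; match Std_residual above
cooks.distance(res.lm)  # the last column of the table
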
res.lm1=lm(carbohydrate~weight+protein,data=carbohydrate)
summary(res.lm1)
## 
## Call:
## lm(formula = carbohydrate ~ weight + protein, data = carbohydrate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.6812  -3.9135   0.9464   4.0880   9.7948 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 33.13032   12.57155   2.635  0.01736 * 
## weight      -0.22165    0.08326  -2.662  0.01642 * 
## protein      1.82429    0.62327   2.927  0.00941 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.971 on 17 degrees of freedom
## Multiple R-squared:  0.4454,	Adjusted R-squared:  0.3802 
## F-statistic: 6.827 on 2 and 17 DF,  p-value: 0.006661

The \(R^2\) value ranges between 0 and 1 and is defined as:

\[R^2 = 1 - \frac{SS_{res}}{SS_{tot}}\]

anova(res.lm)
## Analysis of Variance Table
## 
## Response: carbohydrate
##           Df Sum Sq Mean Sq F value   Pr(>F)   
## age        1   3.82    3.82  0.1076 0.747136   
## weight     1 183.98  183.98  5.1856 0.036859 * 
## protein    1 337.34  337.34  9.5082 0.007121 **
## Residuals 16 567.66   35.48                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(res.lm1)
## Analysis of Variance Table
## 
## Response: carbohydrate
##           Df Sum Sq Mean Sq F value   Pr(>F)   
## weight     1 181.38 181.378   5.088 0.037560 * 
## protein    1 305.40 305.400   8.567 0.009409 **
## Residuals 17 606.02  35.648                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SStot of res.lm
sum(anova(res.lm)$`Sum Sq`)
## [1] 1092.8
sum((carbohydrate$carbohydrate-mean(carbohydrate$carbohydrate))^2)
## [1] 1092.8
#SSreg (regression sum of squares) of res.lm
sum(anova(res.lm)$`Sum Sq`[1:3])
## [1] 525.1371
sum((res.lm$fitted.values-mean(carbohydrate$carbohydrate))^2)
## [1] 525.1371
sum((res.lm$fitted.values-mean(res.lm$fitted.values))^2)
## [1] 525.1371
#SStot of res.lm1
sum(anova(res.lm1)$`Sum Sq`)
## [1] 1092.8
sum((carbohydrate$carbohydrate-mean(carbohydrate$carbohydrate))^2)
## [1] 1092.8
#SSreg (regression sum of squares) of res.lm1
sum(anova(res.lm1)$`Sum Sq`[1:2])
## [1] 486.7781
sum((res.lm1$fitted.values-mean(carbohydrate$carbohydrate))^2)
## [1] 486.7781
#R-square
summary(res.lm)$r.squared
## [1] 0.4805428
1-anova(res.lm)$`Sum Sq`[4]/sum(anova(res.lm)$`Sum Sq`)
## [1] 0.4805428
summary(res.lm1)$r.squared
## [1] 0.4454411
1-anova(res.lm1)$`Sum Sq`[3]/sum(anova(res.lm1)$`Sum Sq`)
## [1] 0.4454411

Adjusted \(R^2\) is computed as:

\[1 - (1 - R^2) \frac{n - 1}{n - p - 1}\]

#res.lm
summary(res.lm)$adj.r.squared
## [1] 0.3831445
1-(1-summary(res.lm)$r.squared)*(20-1)/(20-3-1)
## [1] 0.3831445
#res.lm1
summary(res.lm1)$adj.r.squared
## [1] 0.3801989
1-(1-summary(res.lm1)$r.squared)*(20-1)/(20-2-1)
## [1] 0.3801989

The square root of \(R^2\) is called the multiple correlation coefficient.

anova(res.lm, res.lm1)
## Analysis of Variance Table
## 
## Model 1: carbohydrate ~ age + weight + protein
## Model 2: carbohydrate ~ weight + protein
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     16 567.66                           
## 2     17 606.02 -1   -38.359 1.0812 0.3139
#F statistic
38.359/(567.66/16)
## [1] 1.081182
1-pf(38.359/(567.66/16), df1=1, df2=16)
## [1] 0.3138919
data(carbohydrate)
res.glm=glm(carbohydrate~age+weight+protein,family=gaussian,data=carbohydrate)
summary(res.glm)
## 
## Call:
## glm(formula = carbohydrate ~ age + weight + protein, family = gaussian, 
##     data = carbohydrate)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 36.96006   13.07128   2.828  0.01213 * 
## age         -0.11368    0.10933  -1.040  0.31389   
## weight      -0.22802    0.08329  -2.738  0.01460 * 
## protein      1.95771    0.63489   3.084  0.00712 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 35.47893)
## 
##     Null deviance: 1092.80  on 19  degrees of freedom
## Residual deviance:  567.66  on 16  degrees of freedom
## AIC: 133.67
## 
## Number of Fisher Scoring iterations: 2
res.glm2=glm(carbohydrate~weight+protein,family=gaussian,data=carbohydrate)
summary(res.glm2)
## 
## Call:
## glm(formula = carbohydrate ~ weight + protein, family = gaussian, 
##     data = carbohydrate)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 33.13032   12.57155   2.635  0.01736 * 
## weight      -0.22165    0.08326  -2.662  0.01642 * 
## protein      1.82429    0.62327   2.927  0.00941 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 35.64835)
## 
##     Null deviance: 1092.80  on 19  degrees of freedom
## Residual deviance:  606.02  on 17  degrees of freedom
## AIC: 132.98
## 
## Number of Fisher Scoring iterations: 2
# Therefore, the difference in the residual sums of squares for Models (6.7) and (6.6) is 606.022−567.663 = 38.359.
res.glm2$deviance
## [1] 606.0219
res.glm$deviance
## [1] 567.6629
res.glm2$deviance - res.glm$deviance
## [1] 38.35907
#(606.02-567.66)
#The value F = 38.359/35.479 = 1.08 is not significant compared with the F(1,16) distribution.
res.glm2$df.residual
## [1] 17
res.glm$df.residual
## [1] 16
(res.glm2$deviance - res.glm$deviance)/(res.glm$deviance/res.glm$df.residual)
## [1] 1.081179
Fstatistic <- (606.02-567.66)/(567.66/16)
Fstatistic
## [1] 1.081211
1-pf(Fstatistic, 1,16)
## [1] 0.3138858
library(cvTools)
full.model <- lm(carbohydrate ~ ., data = carbohydrate)
cvFit(full.model, data = carbohydrate, K = 5, R = 20, y = carbohydrate$carbohydrate)
## 5-fold CV results:
##       CV 
## 7.136062
library(olsrr)
full.model <- lm(carbohydrate ~ ., data = carbohydrate)
ols_step_both_p(full.model, details=TRUE)
## Stepwise Selection Method   
## ---------------------------
## 
## Candidate Terms: 
## 
## 1. age 
## 2. weight 
## 3. protein 
## 
## We are selecting variables based on p value...
## 
## 
## Stepwise Selection: Step 1 
## 
## - protein added 
## 
##                          Model Summary                          
## ---------------------------------------------------------------
## R                        0.463       RMSE                6.907 
## R-Squared                0.214       Coef. Var          18.369 
## Adj. R-Squared           0.171       MSE                47.703 
## Pred R-Squared          -0.010       MAE                 5.842 
## ---------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                 Sum of                                             
##                Squares        DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     234.150         1        234.150    4.909    0.0399 
## Residual       858.650        18         47.703                    
## Total         1092.800        19                                   
## -------------------------------------------------------------------
## 
##                                   Parameter Estimates                                    
## ----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta      t       Sig       lower     upper 
## ----------------------------------------------------------------------------------------
## (Intercept)    12.479        11.444                 1.090    0.290    -11.563    36.521 
##     protein     1.580         0.713        0.463    2.216    0.040      0.082     3.078 
## ----------------------------------------------------------------------------------------
## 
## 
## 
## Stepwise Selection: Step 2 
## 
## - weight added 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.667       RMSE                5.971 
## R-Squared               0.445       Coef. Var          15.879 
## Adj. R-Squared          0.380       MSE                35.648 
## Pred R-Squared          0.236       MAE                 4.593 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                 Sum of                                             
##                Squares        DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     486.778         2        243.389    6.827    0.0067 
## Residual       606.022        17         35.648                    
## Total         1092.800        19                                   
## -------------------------------------------------------------------
## 
##                                   Parameter Estimates                                    
## ----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## ----------------------------------------------------------------------------------------
## (Intercept)    33.130        12.572                  2.635    0.017     6.607    59.654 
##     protein     1.824         0.623        0.534     2.927    0.009     0.509     3.139 
##      weight    -0.222         0.083       -0.486    -2.662    0.016    -0.397    -0.046 
## ----------------------------------------------------------------------------------------
## 
## 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.667       RMSE                5.971 
## R-Squared               0.445       Coef. Var          15.879 
## Adj. R-Squared          0.380       MSE                35.648 
## Pred R-Squared          0.236       MAE                 4.593 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                 Sum of                                             
##                Squares        DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     486.778         2        243.389    6.827    0.0067 
## Residual       606.022        17         35.648                    
## Total         1092.800        19                                   
## -------------------------------------------------------------------
## 
##                                   Parameter Estimates                                    
## ----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## ----------------------------------------------------------------------------------------
## (Intercept)    33.130        12.572                  2.635    0.017     6.607    59.654 
##     protein     1.824         0.623        0.534     2.927    0.009     0.509     3.139 
##      weight    -0.222         0.083       -0.486    -2.662    0.016    -0.397    -0.046 
## ----------------------------------------------------------------------------------------
## 
## 
## 
## No more variables to be added/removed.
## 
## 
## Final Model Output 
## ------------------
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.667       RMSE                5.971 
## R-Squared               0.445       Coef. Var          15.879 
## Adj. R-Squared          0.380       MSE                35.648 
## Pred R-Squared          0.236       MAE                 4.593 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                 Sum of                                             
##                Squares        DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     486.778         2        243.389    6.827    0.0067 
## Residual       606.022        17         35.648                    
## Total         1092.800        19                                   
## -------------------------------------------------------------------
## 
##                                   Parameter Estimates                                    
## ----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## ----------------------------------------------------------------------------------------
## (Intercept)    33.130        12.572                  2.635    0.017     6.607    59.654 
##     protein     1.824         0.623        0.534     2.927    0.009     0.509     3.139 
##      weight    -0.222         0.083       -0.486    -2.662    0.016    -0.397    -0.046 
## ----------------------------------------------------------------------------------------
## 
##                              Stepwise Selection Summary                              
## ------------------------------------------------------------------------------------
##                      Added/                   Adj.                                      
## Step    Variable    Removed     R-Square    R-Square     C(p)       AIC        RMSE     
## ------------------------------------------------------------------------------------
##    1    protein     addition       0.214       0.171    8.2020    137.9501    6.9067    
##    2     weight     addition       0.445       0.380    3.0810    132.9812    5.9706    
## ------------------------------------------------------------------------------------
#R code (lasso)
library(glmnet)
y = carbohydrate$carbohydrate
x = as.matrix(carbohydrate[,c('age','weight','protein')])
fit = glmnet(x, y)
plot(fit, xvar='lambda')

cvfit = cv.glmnet(x, y, grouped=FALSE)
coef(cvfit, s = "lambda.1se")
## 4 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept) 33.889042731
## age         -0.004292978
## weight      -0.148035052
## protein      1.276513112
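
The lambda.1se rule used above picks the sparsest model whose cross-validated error is within one standard error of the minimum. As a quick sketch, the less conservative coefficients at lambda.min can be extracted the same way (output not shown):

#coefficients at the lambda that minimizes the cross-validated error
coef(cvfit, s = "lambda.min")
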
plant.dried
## # A tibble: 30 × 2
##    group      weight
##    <chr>       <dbl>
##  1 Control      4.17
##  2 Control      5.58
##  3 Control      5.18
##  4 Control      6.11
##  5 Control      4.5 
##  6 Control      4.61
##  7 Control      5.17
##  8 Control      4.53
##  9 Control      5.33
## 10 Control      5.14
## 11 TreatmentA   4.81
## 12 TreatmentA   4.17
## 13 TreatmentA   4.41
## 14 TreatmentA   3.59
## 15 TreatmentA   5.87
## 16 TreatmentA   3.83
## 17 TreatmentA   6.03
## 18 TreatmentA   4.89
## 19 TreatmentA   4.32
## 20 TreatmentA   4.69
## 21 TreatmentB   6.31
## 22 TreatmentB   5.12
## 23 TreatmentB   5.54
## 24 TreatmentB   5.5 
## 25 TreatmentB   5.37
## # ℹ 5 more rows
res.glm1=glm(weight~-1 + group, family=gaussian, data=plant.dried)
summary(res.glm1)
## 
## Call:
## glm(formula = weight ~ -1 + group, family = gaussian, data = plant.dried)
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## groupControl      5.0320     0.1971   25.53   <2e-16 ***
## groupTreatmentA   4.6610     0.1971   23.64   <2e-16 ***
## groupTreatmentB   5.5260     0.1971   28.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3885959)
## 
##     Null deviance: 786.318  on 30  degrees of freedom
## Residual deviance:  10.492  on 27  degrees of freedom
## AIC: 61.619
## 
## Number of Fisher Scoring iterations: 2
res.glm0=glm(weight~1, family=gaussian, data=plant.dried)
summary(res.glm0)
## 
## Call:
## glm(formula = weight ~ 1, family = gaussian, data = plant.dried)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.073      0.128   39.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.49167)
## 
##     Null deviance: 14.258  on 29  degrees of freedom
## Residual deviance: 14.258  on 29  degrees of freedom
## AIC: 66.821
## 
## Number of Fisher Scoring iterations: 2
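
Comparing these two models gives the F test for differences between the three groups. A sketch using the residual deviances printed above (D0 = 14.258 on 29 df for the intercept-only model, D1 = 10.492 on 27 df for the group-means model):

#F test for differences between groups
D0 <- res.glm0$deviance
D1 <- res.glm1$deviance
Fstat <- ((D0 - D1)/(29 - 27))/(D1/27)
Fstat                            #approximately 4.85 on (2, 27) df
1 - pf(Fstat, df1 = 2, df2 = 27) #significant at the 5% level
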
balanced
## # A tibble: 12 × 3
##    `factor A` `factor B`  data
##    <chr>      <chr>      <dbl>
##  1 A1         B1           6.8
##  2 A1         B1           6.6
##  3 A1         B2           5.3
##  4 A1         B2           6.1
##  5 A2         B1           7.5
##  6 A2         B1           7.4
##  7 A2         B2           7.2
##  8 A2         B2           6.5
##  9 A3         B1           7.8
## 10 A3         B1           9.1
## 11 A3         B2           8.8
## 12 A3         B2           9.1
res.glmint=glm(data~`factor A`*`factor B`, family=gaussian, data=balanced)
res.glmadd=glm(data~`factor A`+`factor B`, family=gaussian, data=balanced)
res.glmA=glm(data~`factor A`, family=gaussian, data=balanced)
res.glmB=glm(data~`factor B`, family=gaussian, data=balanced)
res.glmmean=glm(data~1, family=gaussian, data=balanced)
res.glmint
## 
## Call:  glm(formula = data ~ `factor A` * `factor B`, family = gaussian, 
##     data = balanced)
## 
## Coefficients:
##               (Intercept)               `factor A`A2  
##                      6.70                       0.75  
##              `factor A`A3               `factor B`B2  
##                      1.75                      -1.00  
## `factor A`A2:`factor B`B2  `factor A`A3:`factor B`B2  
##                      0.40                       1.50  
## 
## Degrees of Freedom: 11 Total (i.e. Null);  6 Residual
## Null Deviance:	    15.83 
## Residual Deviance: 1.48 	AIC: 22.94
summary(res.glmint)
## 
## Call:
## glm(formula = data ~ `factor A` * `factor B`, family = gaussian, 
##     data = balanced)
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 6.7000     0.3512  19.078 1.34e-06 ***
## `factor A`A2                0.7500     0.4967   1.510   0.1818    
## `factor A`A3                1.7500     0.4967   3.524   0.0125 *  
## `factor B`B2               -1.0000     0.4967  -2.013   0.0907 .  
## `factor A`A2:`factor B`B2   0.4000     0.7024   0.569   0.5897    
## `factor A`A3:`factor B`B2   1.5000     0.7024   2.136   0.0766 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2466667)
## 
##     Null deviance: 15.83  on 11  degrees of freedom
## Residual deviance:  1.48  on  6  degrees of freedom
## AIC: 22.94
## 
## Number of Fisher Scoring iterations: 2
anova(res.glmint)
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: data
## 
## Terms added sequentially (first to last)
## 
## 
##                       Df Deviance Resid. Df Resid. Dev
## NULL                                     11    15.8300
## `factor A`             2  12.7400         9     3.0900
## `factor B`             1   0.4033         8     2.6867
## `factor A`:`factor B`  2   1.2067         6     1.4800
#identity: b^T X^T y = y^T y - (y - Xb)^T (y - Xb)
sum(res.glmint$data$data^2)
## [1] 664.1
crossprod(res.glmint$data$data)
##       [,1]
## [1,] 664.1
res.glmint$data$data-res.glmint$fitted.values
##     1     2     3     4     5     6     7     8     9    10    11    12 
##  0.10 -0.10 -0.40  0.40  0.05 -0.05  0.35 -0.35 -0.65  0.65 -0.15  0.15
res.glmint$residuals
##     1     2     3     4     5     6     7     8     9    10    11    12 
##  0.10 -0.10 -0.40  0.40  0.05 -0.05  0.35 -0.35 -0.65  0.65 -0.15  0.15
sum(res.glmint$data$data^2) - sum((res.glmint$data$data-res.glmint$fitted.values)^2)
## [1] 662.62
summary(res.glmadd)
## 
## Call:
## glm(formula = data ~ `factor A` + `factor B`, family = gaussian, 
##     data = balanced)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.3833     0.3346  19.079  5.9e-08 ***
## `factor A`A2   0.9500     0.4098   2.318 0.049047 *  
## `factor A`A3   2.5000     0.4098   6.101 0.000289 ***
## `factor B`B2  -0.3667     0.3346  -1.096 0.305015    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3358333)
## 
##     Null deviance: 15.8300  on 11  degrees of freedom
## Residual deviance:  2.6867  on  8  degrees of freedom
## AIC: 26.095
## 
## Number of Fisher Scoring iterations: 2
sum(res.glmadd$data$data^2) - sum((res.glmadd$residuals)^2)
## [1] 661.4133
summary(res.glmA)
## 
## Call:
## glm(formula = data ~ `factor A`, family = gaussian, data = balanced)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.2000     0.2930  21.162 5.51e-09 ***
## `factor A`A2   0.9500     0.4143   2.293 0.047550 *  
## `factor A`A3   2.5000     0.4143   6.034 0.000194 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3433333)
## 
##     Null deviance: 15.83  on 11  degrees of freedom
## Residual deviance:  3.09  on  9  degrees of freedom
## AIC: 25.774
## 
## Number of Fisher Scoring iterations: 2
sum(res.glmA$data$data^2) - sum((res.glmA$residuals)^2)
## [1] 661.01
summary(res.glmB)
## 
## Call:
## glm(formula = data ~ `factor B`, family = gaussian, data = balanced)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.5333     0.5071  14.857 3.83e-08 ***
## `factor B`B2  -0.3667     0.7171  -0.511     0.62    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.542667)
## 
##     Null deviance: 15.830  on 11  degrees of freedom
## Residual deviance: 15.427  on 10  degrees of freedom
## AIC: 43.069
## 
## Number of Fisher Scoring iterations: 2
sum(res.glmB$data$data^2) - sum((res.glmB$residuals)^2)
## [1] 648.6733
summary(res.glmmean)
## 
## Call:
## glm(formula = data ~ 1, family = gaussian, data = balanced)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.3500     0.3463   21.22 2.82e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.439091)
## 
##     Null deviance: 15.83  on 11  degrees of freedom
## Residual deviance: 15.83  on 11  degrees of freedom
## AIC: 41.379
## 
## Number of Fisher Scoring iterations: 2
sum(res.glmmean$data$data^2) - sum((res.glmmean$residuals)^2)
## [1] 648.27
#Table6.13
tibble(
  d.f.=c(res.glmint$df.residual,
  res.glmadd$df.residual,
  res.glmA$df.residual,
  res.glmB$df.residual,
  res.glmmean$df.residual),
  bTXTy=c(sum(res.glmint$data$data^2) - sum((res.glmint$residuals)^2),
          sum(res.glmadd$data$data^2) - sum((res.glmadd$residuals)^2),
          sum(res.glmA$data$data^2) - sum((res.glmA$residuals)^2),
          sum(res.glmB$data$data^2) - sum((res.glmB$residuals)^2),
          sum(res.glmmean$data$data^2) - sum((res.glmmean$residuals)^2)),
  scaled_deviance= c(res.glmint$deviance,
  res.glmadd$deviance,
  res.glmA$deviance,
  res.glmB$deviance,
  res.glmmean$deviance),
  ) |> as.data.frame()
##   d.f.    bTXTy scaled_deviance
## 1    6 662.6200        1.480000
## 2    8 661.4133        2.686667
## 3    9 661.0100        3.090000
## 4   10 648.6733       15.426667
## 5   11 648.2700       15.830000
sigmaDs <- res.glmint$deviance
sigmaDi <- res.glmadd$deviance
#F statistic for the interaction term (additive vs saturated model)
((sigmaDi-sigmaDs)/(res.glmadd$df.residual-res.glmint$df.residual))/(sigmaDs/res.glmint$df.residual)
## [1] 2.445946
df1=res.glmadd$df.residual-res.glmint$df.residual
df1
## [1] 2
df2=res.glmint$df.residual
df2
## [1] 6
#F statistic and p-value for the interaction term
Fstatistic=((sigmaDi-sigmaDs)/(res.glmadd$df.residual-res.glmint$df.residual))/(sigmaDs/res.glmint$df.residual)
1-pf(Fstatistic, df1=df1, df2=df2)
## [1] 0.1671644
#F test for the effect of factor B (model with A only vs additive model)
sigmaDs <- res.glmint$deviance
sigmaDi <- res.glmadd$deviance
sigmaDa <- res.glmA$deviance
df1=res.glmA$df.residual-res.glmadd$df.residual
df1
## [1] 1
df2=res.glmint$df.residual
df2
## [1] 6
Fstatistic=((sigmaDa-sigmaDi)/(res.glmA$df.residual-res.glmadd$df.residual))/(sigmaDs/res.glmint$df.residual)
Fstatistic
## [1] 1.635135
1-pf(Fstatistic, df1=df1, df2=df2)
## [1] 0.2482245
#F test for the effect of factor A (model with B only vs additive model)
sigmaDs <- res.glmint$deviance
sigmaDi <- res.glmadd$deviance
sigmaDb <- res.glmB$deviance
df1=res.glmB$df.residual-res.glmadd$df.residual
df1
## [1] 2
df2=res.glmint$df.residual
df2
## [1] 6
Fstatistic=((sigmaDb-sigmaDi)/(res.glmB$df.residual-res.glmadd$df.residual))/(sigmaDs/res.glmint$df.residual)
Fstatistic
## [1] 25.82432
1-pf(Fstatistic, df1=df1, df2=df2)
## [1] 0.001127422
#Table6.14
tibble(Source_of_variation=c(
  "Mean",
  "Level A",
  "Level B",
  "Interactions",
  "Residual"
),
  d.f.=c(
  length(res.glmmean$coefficients),
  length(res.glmA$coefficients)-length(res.glmmean$coefficients),
  length(res.glmB$coefficients)-length(res.glmmean$coefficients),
  length(res.glmint$coefficients)-length(res.glmA$coefficients)-length(res.glmB$coefficients)+length(res.glmmean$coefficients),
  res.glmint$df.residual
  ),
  sum_of_squares=c(
    sum(res.glmmean$data$data^2) - sum((res.glmmean$residuals)^2),
    sum(res.glmA$data$data^2) - sum((res.glmA$residuals)^2) - (sum(res.glmmean$data$data^2) - sum((res.glmmean$residuals)^2)),
    sum(res.glmB$data$data^2) - sum((res.glmB$residuals)^2) - (sum(res.glmmean$data$data^2) - sum((res.glmmean$residuals)^2)),
    sum(res.glmint$data$data^2) - sum((res.glmint$residuals)^2) -
    (sum(res.glmA$data$data^2) - sum((res.glmA$residuals)^2)) - 
      (sum(res.glmB$data$data^2) - sum((res.glmB$residuals)^2)) +
      ((sum(res.glmmean$data$data^2) - sum((res.glmmean$residuals)^2))),
    sum((res.glmint$residuals)^2)
  ),
  Mean_squares = sum_of_squares/`d.f.`,
  "F"=c(
    " ",
    ((sigmaDb-sigmaDi)/(res.glmB$df.residual-res.glmadd$df.residual))/(sigmaDs/res.glmint$df.residual),
    ((sigmaDa-sigmaDi)/(res.glmA$df.residual-res.glmadd$df.residual))/(sigmaDs/res.glmint$df.residual),
    ((sigmaDi-sigmaDs)/(res.glmadd$df.residual-res.glmint$df.residual))/(sigmaDs/res.glmint$df.residual),
    " "
  )
  ) |> as.data.frame()
##   Source_of_variation d.f. sum_of_squares Mean_squares                F
## 1                Mean    1    648.2700000  648.2700000                 
## 2             Level A    2     12.7400000    6.3700000 25.8243243243243
## 3             Level B    1      0.4033333    0.4033333 1.63513513513513
## 4        Interactions    2      1.2066667    0.6033333 2.44594594594595
## 5            Residual    6      1.4800000    0.2466667
anova(res.glmmean, res.glmA, res.glmB, res.glmadd, res.glmint)
## Analysis of Deviance Table
## 
## Model 1: data ~ 1
## Model 2: data ~ `factor A`
## Model 3: data ~ `factor B`
## Model 4: data ~ `factor A` + `factor B`
## Model 5: data ~ `factor A` * `factor B`
##   Resid. Df Resid. Dev Df Deviance
## 1        11    15.8300            
## 2         9     3.0900  2  12.7400
## 3        10    15.4267 -1 -12.3367
## 4         8     2.6867  2  12.7400
## 5         6     1.4800  2   1.2067
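
Note that models 2 and 3 in this sequential call are not nested (factor A only vs factor B only), so the row with negative degrees of freedom is not a meaningful test. A sketch of pairwise comparisons between properly nested models (output omitted; the deviance differences match the F calculations above):

#pairwise comparisons of nested models
anova(res.glmadd, res.glmint) #interaction term
anova(res.glmA, res.glmadd)   #effect of factor B given A
anova(res.glmB, res.glmadd)   #effect of factor A given B
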
#Table6.14
anova(res.glmmean, res.glmA, res.glmB, res.glmadd, res.glmint)$`Resid. Dev`
## [1] 15.830000  3.090000 15.426667  2.686667  1.480000
#sum_of_squares Mean
sum(res.glmmean$data$data^2) - sum((res.glmmean$residuals)^2)
## [1] 648.27
#sum_of_squares Level A
anova(res.glmmean, res.glmA, res.glmB, res.glmadd, res.glmint)$`Resid. Dev`[1] - anova(res.glmmean, res.glmA, res.glmB, res.glmadd, res.glmint)$`Resid. Dev`[2]
## [1] 12.74
#sum_of_squares Level B
anova(res.glmmean, res.glmA, res.glmB, res.glmadd, res.glmint)$`Resid. Dev`[1] - anova(res.glmmean, res.glmA, res.glmB, res.glmadd, res.glmint)$`Resid. Dev`[3]
## [1] 0.4033333
#sum_of_squares Interactions
anova(res.glmmean, res.glmA, res.glmB, res.glmadd, res.glmint)$`Resid. Dev`[4] - anova(res.glmmean, res.glmA, res.glmB, res.glmadd, res.glmint)$`Resid. Dev`[5]
## [1] 1.206667
#sum_of_squares Residual
anova(res.glmmean, res.glmA, res.glmB, res.glmadd, res.glmint)$`Resid. Dev`[5]
## [1] 1.48
#Table6.15
achievement
## # A tibble: 21 × 3
##    method     y     x
##    <chr>  <dbl> <dbl>
##  1 A          6     3
##  2 A          4     1
##  3 A          5     3
##  4 A          3     1
##  5 A          4     2
##  6 A          3     1
##  7 A          6     4
##  8 B          8     4
##  9 B          9     5
## 10 B          7     5
## 11 B          9     4
## 12 B          8     3
## 13 B          5     1
## 14 B          7     2
## 15 C          6     3
## 16 C          7     2
## 17 C          7     2
## 18 C          7     3
## 19 C          8     4
## 20 C          5     1
## 21 C          7     4
achievement |> ggplot(aes(x=x,
                          y=y,
                          group=method)) + geom_point(aes(shape=method, color=method), size=3) +
  theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
) + xlab("Initial aptitude score (x)") +
  ylab("Achievement score (y)") + scale_shape_manual(values=c(1, 3, 18))

#Model 6.13
res.glm=glm(y~x+method, family=gaussian, data=achievement)
summary(res.glm)
## 
## Call:
## glm(formula = y ~ x + method, family = gaussian, data = achievement)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.8367     0.4235   6.699 3.74e-06 ***
## x             0.7429     0.1421   5.227 6.84e-05 ***
## methodB       2.1878     0.4545   4.814 0.000162 ***
## methodC       1.8612     0.4240   4.390 0.000400 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.6060024)
## 
##     Null deviance: 63.810  on 20  degrees of freedom
## Residual deviance: 10.302  on 17  degrees of freedom
## AIC: 54.64
## 
## Number of Fisher Scoring iterations: 2
#Model 6.13
summary(res.glm)$coefficients
##              Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 2.8367347  0.4234705 6.698778 3.739415e-06
## x           0.7428571  0.1421270 5.226714 6.837988e-05
## methodB     2.1877551  0.4544618 4.813947 1.620169e-04
## methodC     1.8612245  0.4239569 4.390126 3.995625e-04
summary(res.glm)$coefficients[1]
## [1] 2.836735
summary(res.glm)$coefficients[1]+summary(res.glm)$coefficients[3]
## [1] 5.02449
summary(res.glm)$coefficients[1]+summary(res.glm)$coefficients[4]
## [1] 4.697959
summary(res.glm)$coefficients[2]
## [1] 0.7428571
#sigmaD1
res.glm$deviance
## [1] 10.30204
#Model 6.14
res.glm2=glm(y~x, family=gaussian, data=achievement)
summary(res.glm2)
## 
## Call:
## glm(formula = y ~ x, family = gaussian, data = achievement)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.4468     0.6112   5.639 1.95e-05 ***
## x             1.0106     0.2001   5.051 7.09e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.433371)
## 
##     Null deviance: 63.810  on 20  degrees of freedom
## Residual deviance: 27.234  on 19  degrees of freedom
## AIC: 71.054
## 
## Number of Fisher Scoring iterations: 2
#Model 6.14
summary(res.glm2)$coefficients
##             Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 3.446809  0.6112211 5.639217 1.946757e-05
## x           1.010638  0.2000691 5.051447 7.088061e-05
summary(res.glm2)$coefficients[1]
## [1] 3.446809
summary(res.glm2)$coefficients[2]
## [1] 1.010638
#sigmaD0
res.glm2$deviance
## [1] 27.23404
anova(res.glm2,res.glm)
## Analysis of Deviance Table
## 
## Model 1: y ~ x
## Model 2: y ~ x + method
##   Resid. Df Resid. Dev Df Deviance
## 1        19     27.234            
## 2        17     10.302  2   16.932
#b^TX^Ty
sum((res.glm2$data$y)^2) - sum((res.glm2$data$y-res.glm2$fitted.values)^2)
## [1] 853.766
sum(res.glm2$fitted.values^2)
## [1] 853.766
anova(res.glm2,res.glm)$`Resid. Df`
## [1] 19 17
#Table6.16
tibble(d.f.=c(length(res.glm2$coefficients),
              anova(res.glm2,res.glm)$`Resid. Df`[1]-anova(res.glm2,res.glm)$`Resid. Df`[2],
              anova(res.glm2,res.glm)$`Resid. Df`[2]),
       sum_of_squares=c(sum(res.glm2$fitted.values^2),
                        anova(res.glm2,res.glm)$`Resid. Dev`[1]-anova(res.glm2,res.glm)$`Resid. Dev`[2],
                        anova(res.glm2,res.glm)$`Resid. Dev`[2]),
       Mean_squares=sum_of_squares/`d.f.`,
       "F"=c("",Mean_squares[2]/Mean_squares[3],"")) |> as.data.frame()
##   d.f. sum_of_squares Mean_squares                F
## 1    2      853.76596  426.8829787                 
## 2    2       16.93200    8.4660009 13.9702431129244
## 3   17      10.30204    0.6060024
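
The F value of about 13.97 on (2, 17) degrees of freedom lies well beyond conventional critical values; a one-line check of the p-value:

#p-value for the F test of differences between teaching methods
1 - pf(13.9702431129244, df1 = 2, df2 = 17) #well below 0.001
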
head(PLOS)
##   nchar authors
## 1   150       6
## 2    88      17
## 3    64       3
## 4   126      30
## 5    87       9
## 6   115       3
dim(PLOS)
## [1] 878   2
#Figure6.7
PLOS |> ggplot(aes(x=authors,
                   y=nchar))+geom_jitter(size=1)+theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
)

#Model6.15
lmodel = lm(nchar ~ authors, data=PLOS)
summary(lmodel)
## 
## Call:
## lm(formula = nchar ~ authors, data = PLOS)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -83.210 -23.156  -1.519  20.753  99.068 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  96.1061     1.7554   54.75   <2e-16 ***
## authors       2.1826     0.1737   12.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.89 on 876 degrees of freedom
## Multiple R-squared:  0.1527,	Adjusted R-squared:  0.1517 
## F-statistic: 157.8 on 1 and 876 DF,  p-value: < 2.2e-16
#Model6.16
qmodel = lm(nchar ~ authors + I(authors^2), data=PLOS)
summary(qmodel)
## 
## Call:
## lm(formula = nchar ~ authors + I(authors^2), data = PLOS)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -90.09 -21.33  -2.51  18.34 102.78 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  81.40444    2.47751   32.86  < 2e-16 ***
## authors       6.06579    0.50618   11.98  < 2e-16 ***
## I(authors^2) -0.15375    0.01891   -8.13 1.46e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.73 on 875 degrees of freedom
## Multiple R-squared:  0.2122,	Adjusted R-squared:  0.2104 
## F-statistic: 117.8 on 2 and 875 DF,  p-value: < 2.2e-16
sum(lmodel$residuals^2)
## [1] 947501.8
sum(qmodel$residuals^2)
## [1] 880950.2
anova(lmodel, qmodel)
## Analysis of Variance Table
## 
## Model 1: nchar ~ authors
## Model 2: nchar ~ authors + I(authors^2)
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    876 947502                                  
## 2    875 880950  1     66552 66.102 1.458e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Table6.21
#Fractional polynomials
p=c(-2,-1,-0.5,0,0.5,1,2,3)
for(i in p){
  model = lm(nchar ~ I(authors^i), data=PLOS)
  print(sum(model$residuals^2))
}
## [1] 926425.7
## [1] 872771.8
## [1] 852655.3
## [1] 1118208
## [1] 896880.9
## [1] 947501.8
## [1] 1025531
## [1] 1064216
p=c(-2,-1,-0.5,0,0.5,1,2,3)
model <- function(i){
  sum(lm(nchar ~ I(authors^i), data=PLOS)$residuals^2)
}

sapply(p, model)
## [1]  926425.7  872771.8  852655.3 1118207.9  896880.9  947501.8 1025530.8
## [8] 1064216.3
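
Among the candidate powers, -0.5 gives the smallest residual sum of squares (852655.3), so it would be the chosen fractional polynomial here. Selecting it programmatically:

#power with the smallest residual sum of squares
p[which.min(sapply(p, model))]
## [1] -0.5
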
sugar
## # A tibble: 6 × 3
##   period  refined manufactured
##   <chr>     <dbl>        <dbl>
## 1 1936-39    32           16.3
## 2 1046-49    31.2         23.1
## 3 1956-59    27           23.6
## 4 1966-69    21           27.7
## 5 1976-79    14.9         34.6
## 6 1986-89     8.8         33.9
sugar2 <- sugar
sugar2["decade"] <- c(seq(0,5))
sugar2
## # A tibble: 6 × 4
##   period  refined manufactured decade
##   <chr>     <dbl>        <dbl>  <int>
## 1 1936-39    32           16.3      0
## 2 1046-49    31.2         23.1      1
## 3 1956-59    27           23.6      2
## 4 1966-69    21           27.7      3
## 5 1976-79    14.9         34.6      4
## 6 1986-89     8.8         33.9      5
model1 <- lm(refined ~ decade, data=sugar2)
summary(model1)
## 
## Call:
## lm(formula = refined ~ decade, data = sugar2)
## 
## Residuals:
##       1       2       3       4       5       6 
## -2.6905  1.3924  2.0752  0.9581 -0.2590 -1.4762 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34.6905     1.4765   23.50 1.95e-05 ***
## decade       -4.8829     0.4877  -10.01 0.000559 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.04 on 4 degrees of freedom
## Multiple R-squared:  0.9616,	Adjusted R-squared:  0.952 
## F-statistic: 100.2 on 1 and 4 DF,  p-value: 0.0005593
#coefficients
model1$coefficients
## (Intercept)      decade 
##   34.690476   -4.882857
#extract standard error of individual regression coefficients
sqrt(diag(vcov(model1)))
## (Intercept)      decade 
##   1.4765230   0.4876795
#extract residual standard error of regression model
summary(model1)$sigma
## [1] 2.04011
#95% confidence interval
t_value <- qt(0.975, df=4)
t_value
## [1] 2.776445
sd <- sqrt(diag(vcov(model1)))[2]
model1$coefficients[2]-t_value*sd
##    decade 
## -6.236872
model1$coefficients[2]+t_value*sd
##    decade 
## -3.528842
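
The built-in confint() reproduces this interval, since it is based on the same t quantile and standard error:

confint(model1, "decade") #reproduces the interval above: (-6.236872, -3.528842)
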
model2 <- lm(manufactured ~ decade, data=sugar2)
summary(model2)
## 
## Call:
## lm(formula = manufactured ~ decade, data = sugar2)
## 
## Residuals:
##       1       2       3       4       5       6 
## -1.1905  1.9924 -1.1248 -0.6419  2.6410 -1.6762 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.4905     1.4854  11.775 0.000298 ***
## decade        3.6171     0.4906   7.373 0.001804 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.052 on 4 degrees of freedom
## Multiple R-squared:  0.9315,	Adjusted R-squared:  0.9143 
## F-statistic: 54.36 on 1 and 4 DF,  p-value: 0.001804
#coefficients
model2$coefficients
## (Intercept)      decade 
##   17.490476    3.617143
#extract standard error of individual regression coefficients
sqrt(diag(vcov(model2)))
## (Intercept)      decade 
##   1.4853655   0.4906001
#95% confidence interval (using model2's own standard error)
sd2 <- sqrt(diag(vcov(model2)))[2]
model2$coefficients[2]-t_value*sd2
##   decade 
## 2.255019
model2$coefficients[2]+t_value*sd2
##   decade 
## 4.979267
sugar2['Total'] <- sugar2['refined']+sugar2['manufactured']
model3 <- lm(Total ~ decade, data=sugar2)
summary(model3)
## 
## Call:
## lm(formula = Total ~ decade, data = sugar2)
## 
## Residuals:
##       1       2       3       4       5       6 
## -3.8810  3.3848  0.9505  0.3162  2.3819 -3.1524 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   52.181      2.377  21.956 2.55e-05 ***
## decade        -1.266      0.785  -1.612    0.182    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.284 on 4 degrees of freedom
## Multiple R-squared:  0.3939,	Adjusted R-squared:  0.2424 
## F-statistic:   2.6 on 1 and 4 DF,  p-value: 0.1822
#Table 6.23
pasture
## # A tibble: 27 × 2
##        K yield
##    <dbl> <dbl>
##  1     0 1754.
##  2    40 4923.
##  3    50 5246.
##  4     5 3185.
##  5    10 3538.
##  6    30 4000 
##  7    15 4185.
##  8    40 4692.
##  9    20 3600 
## 10    15 3108.
## 11    30 4415.
## 12    50 4938.
## 13     5 3046.
## 14     0 2554.
## 15    10 3323.
## 16    40 4462.
## 17    20 4215.
## 18    40 4154.
## 19    10 2400 
## 20     5 2862.
## 21    40 3723 
## 22    30 4892.
## 23    40 4785.
## 24    20 3185.
## 25     0 2723.
## # ℹ 2 more rows
#EXERCISES6.2
pasture |> ggplot(aes(x=K,
                   y=yield))+geom_point(size=1)+theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
)

lmodel2 = lm(yield ~ K, data=pasture)
summary(lmodel2)
## 
## Call:
## lm(formula = yield ~ K, data = pasture)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -890.32 -231.14  -11.45  252.25  814.96 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2644.224    148.366  17.822 1.01e-15 ***
## K             48.361      5.192   9.315 1.32e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 445.1 on 25 degrees of freedom
## Multiple R-squared:  0.7763,	Adjusted R-squared:  0.7674 
## F-statistic: 86.77 on 1 and 25 DF,  p-value: 1.316e-09
qmodel2 = lm(yield ~ K + I(K^2), data=pasture)
summary(qmodel2)
## 
## Call:
## lm(formula = yield ~ K + I(K^2), data = pasture)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -876.18 -257.45   82.54  287.88  722.17 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2471.2637   196.1115  12.601 4.51e-12 ***
## K             73.8059    19.8999   3.709   0.0011 ** 
## I(K^2)        -0.5152     0.3894  -1.323   0.1983    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 438.6 on 24 degrees of freedom
## Multiple R-squared:  0.7915,	Adjusted R-squared:  0.7742 
## F-statistic: 45.56 on 2 and 24 DF,  p-value: 6.736e-09
anova(lmodel2, qmodel2)
## Analysis of Variance Table
## 
## Model 1: yield ~ K
## Model 2: yield ~ K + I(K^2)
##   Res.Df     RSS Df Sum of Sq      F Pr(>F)
## 1     25 4952793                           
## 2     24 4616095  1    336698 1.7506 0.1983
#standard error of each residual: sigma_hat * sqrt(1 - h_ii)
N=length(pasture$K)
p=3
sigma_sq <- (t(pasture$yield-qmodel2$fitted.values) %*% (pasture$yield-qmodel2$fitted.values))/(N-p)
sigma_hat <- sqrt(sigma_sq)
std_e <- as.vector(sigma_hat)*sqrt(1-hatvalues(qmodel2))
std_e
##        1        2        3        4        5        6        7        8 
## 392.2723 421.5502 382.4364 417.6786 424.3213 418.1877 422.4668 421.5502 
##        9       10       11       12       13       14       15       16 
## 418.6776 422.4668 418.1877 382.4364 417.6786 392.2723 424.3213 421.5502 
##       17       18       19       20       21       22       23       24 
## 418.6776 421.5502 424.3213 417.6786 421.5502 418.1877 421.5502 418.6776 
##       25       26       27 
## 392.2723 382.4364 422.4668
#standardized residuals
qmodel2$residuals/std_e
##           1           2           3           4           5           6 
## -1.82873907  0.76839933  0.97438347  0.85517076  0.89718998 -0.53029260 
##           7           8           9          10          11          12 
##  1.70940578  0.22089635 -0.33749718 -0.83967034  0.46304132  0.16954371 
##          13          14          15          16          17          18 
##  0.52381553  0.21040554  0.38955584 -0.32660664  1.13236888 -1.05629433 
##          19          20          21          22          23          24 
## -1.78591804  0.08184895 -2.07847386  1.60343841  0.43985010 -1.32966870 
##          25          26          27 
##  0.64199347 -0.23261469 -0.69386007
plot(qmodel2$residuals/std_e)
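
These hand-computed standardized residuals agree with R's rstandard(), which applies the same sigma_hat*sqrt(1 - h_ii) scaling; a quick sketch:

#built-in equivalent of the manual standardization above
all.equal(unname(rstandard(qmodel2)), unname(qmodel2$residuals/std_e))
## [1] TRUE
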

#EXERCISES6.3
m1 <- lm(carbohydrate~age+weight+protein,data=carbohydrate)
m2 <- lm(carbohydrate~weight+protein,data=carbohydrate)
m3 <- lm(carbohydrate~age+protein,data=carbohydrate)
m4 <- lm(carbohydrate~protein,data=carbohydrate)
anova(m1, m2, m3, m4)
## Analysis of Variance Table
## 
## Model 1: carbohydrate ~ age + weight + protein
## Model 2: carbohydrate ~ weight + protein
## Model 3: carbohydrate ~ age + protein
## Model 4: carbohydrate ~ protein
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     16 567.66                           
## 2     17 606.02 -1   -38.359 1.0812 0.3139
## 3     17 833.57  0  -227.549              
## 4     18 858.65 -1   -25.079 0.7069 0.4129
#EXERCISES6.4
cholesterol
## # A tibble: 30 × 3
##     chol   age   bmi
##    <dbl> <dbl> <dbl>
##  1  5.94    52  20.7
##  2  4.71    46  21.3
##  3  5.86    51  25.4
##  4  6.52    44  22.7
##  5  6.8     70  23.9
##  6  5.23    33  24.3
##  7  4.97    21  22.2
##  8  8.78    63  26.2
##  9  5.13    56  23.3
## 10  6.74    54  29.2
## 11  5.95    44  22.7
## 12  5.83    71  21.9
## 13  5.74    39  22.4
## 14  4.92    58  20.2
## 15  6.69    58  24.4
## 16  6.48    65  26.3
## 17  8.83    76  22.7
## 18  5.1     47  21.5
## 19  5.81    43  20.7
## 20  4.65    30  18.9
## 21  6.82    58  23.9
## 22  6.28    78  24.3
## 23  5.15    49  23.8
## 24  2.92    36  19.6
## 25  9.27    67  24.3
## # ℹ 5 more rows
m5 <- lm(chol~age+bmi,data=cholesterol)
m6 <- lm(chol~age,data=cholesterol)
anova(m5, m6)
## Analysis of Variance Table
## 
## Model 1: chol ~ age + bmi
## Model 2: chol ~ age
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     27 26.571                              
## 2     28 31.636 -1   -5.0655 5.1474 0.03149 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#EXERCISES6.5
plasma
## # A tibble: 31 × 2
##    Group phosphate
##    <chr>     <dbl>
##  1 H-O         2.3
##  2 H-O         4.1
##  3 H-O         4.2
##  4 H-O         4  
##  5 H-O         4.6
##  6 H-O         4.6
##  7 H-O         3.8
##  8 H-O         5.2
##  9 H-O         3.1
## 10 H-O         3.7
## 11 H-O         3.8
## 12 N-O         3  
## 13 N-O         4.1
## 14 N-O         3.9
## 15 N-O         3.1
## 16 N-O         3.3
## 17 N-O         2.9
## 18 N-O         3.3
## 19 N-O         3.9
## 20 C           3  
## 21 C           2.6
## 22 C           3.1
## 23 C           2.2
## 24 C           2.1
## 25 C           2.4
## # ℹ 6 more rows
m7 <- lm(phosphate~Group,data=plasma)
#Sum of squares: mean
length(plasma$phosphate)*mean(plasma$phosphate)^2
## [1] 350.919
anova(m7)
## Analysis of Variance Table
## 
## Response: phosphate
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## Group      2 7.8083  3.9041  11.651 0.0002082 ***
## Residuals 28 9.3827  0.3351                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tibble(d.f.=c(1,
              anova(m7)$`Df`[1],
              anova(m7)$`Df`[2]),
       Sum_of_squares=c(length(plasma$phosphate)*mean(plasma$phosphate)^2,
                        anova(m7)$`Sum Sq`[1],
                        anova(m7)$`Sum Sq`[2]),
       Mean_squares=Sum_of_squares/d.f.,
       "F"=c("",Mean_squares[2]/Mean_squares[3],"")) |> as.data.frame()
##   d.f. Sum_of_squares Mean_squares                F
## 1    1     350.919032   350.919032                 
## 2    2       7.808278     3.904139 11.6508063181284
## 3   28       9.382689     0.335096
1-pf(11.6508063181284, df1=2, df2=28)
## [1] 0.0002081509
#pooled standard deviation
N=length(plasma$phosphate)
p=3
sqrt(sum((plasma$phosphate-m7$fitted.values)^2)/(N-p))
## [1] 0.5788748
summary(m7)
## 
## Call:
## lm(formula = phosphate ~ Group, data = plasma)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.64545 -0.29148  0.01667  0.36667  1.25455 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.7833     0.1671  16.656 4.63e-16 ***
## GroupH-O      1.1621     0.2416   4.809 4.67e-05 ***
## GroupN-O      0.6542     0.2642   2.476   0.0196 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5789 on 28 degrees of freedom
## Multiple R-squared:  0.4542,	Adjusted R-squared:  0.4152 
## F-statistic: 11.65 on 2 and 28 DF,  p-value: 0.0002082
m7$coefficients
## (Intercept)    GroupH-O    GroupN-O 
##   2.7833333   1.1621212   0.6541667
plasma |> group_by(Group) |> summarise(n=n())
## # A tibble: 3 × 2
##   Group     n
##   <chr> <int>
## 1 C        12
## 2 H-O      11
## 3 N-O       8
t_value <- qt(0.975, df=28)
t_value
## [1] 2.048407
sd <- sqrt(sum((plasma$phosphate-m7$fitted.values)^2)/(N-p))
sd
## [1] 0.5788748
#95% confidence interval for the difference between groups H-O and N-O
m7$coefficients[2]-m7$coefficients[3]-t_value*sd*sqrt(1/11+1/8)
##    GroupH-O 
## -0.04302617
m7$coefficients[2]-m7$coefficients[3]+t_value*sd*sqrt(1/11+1/8)
## GroupH-O 
## 1.058935
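
The pooled standard error used here can also be read off the fitted model's covariance matrix, since var(b2 - b3) = var(b2) + var(b3) - 2 cov(b2, b3); a sketch, which reproduces sd*sqrt(1/11 + 1/8):

#standard error of the H-O vs N-O difference from the covariance matrix
se_diff <- sqrt(vcov(m7)[2, 2] + vcov(m7)[3, 3] - 2*vcov(m7)[2, 3])
se_diff #equals sd*sqrt(1/11 + 1/8)
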
#EXERCISES6.6
machine2 <- machine |> mutate(day=factor(day),
                  worker=factor(worker)) |> drop_na() |> as.data.frame()
machine2[37, 3] <- 32.9
machine2$weight
##  [1] 35.7 37.1 36.7 37.7 35.3 38.4 37.2 38.1 36.9 37.2 34.9 34.3 34.5 33.7 36.2
## [16] 37.1 35.5 36.5 36.0 33.8 34.7 35.2 34.6 36.4 35.2 36.9 38.5 36.4 37.8 36.1
## [31] 32.0 35.2 33.5 32.9 33.3 35.8 32.9 35.7 38.0 36.1
#Sum of squares: Mean
length(machine2$weight)*(mean(machine2$weight))^2
## [1] 51122.5
m8 <- lm(weight~1,data=machine2)
m9 <- lm(weight~worker,data=machine2)
m10 <- lm(weight~day,data=machine2)
m11 <- lm(weight~day+worker,data=machine2)
m12 <- lm(weight~day*worker,data=machine2)

anova(m8,m9,m10,m11,m12)
## Analysis of Variance Table
## 
## Model 1: weight ~ 1
## Model 2: weight ~ worker
## Model 3: weight ~ day
## Model 4: weight ~ day + worker
## Model 5: weight ~ day * worker
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1     39 103.860                                  
## 2     36  49.238  3    54.622 14.495 3.895e-06 ***
## 3     38  97.776 -2   -48.538 19.320 3.144e-06 ***
## 4     35  43.154  3    54.622 14.495 3.895e-06 ***
## 5     32  40.196  3     2.958  0.785    0.5112    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

There are significant differences between workers and between days but no evidence of interaction effects.
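
The interaction test can also be isolated with a direct pairwise comparison, reproducing row 5 of the table above (sketch; output omitted):

#direct test of the day:worker interaction
anova(m11, m12) #F = 0.785 on (3, 32) df, p = 0.5112
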

#EXERCISES6.7
balanced
## # A tibble: 12 × 3
##    `factor A` `factor B`  data
##    <chr>      <chr>      <dbl>
##  1 A1         B1           6.8
##  2 A1         B1           6.6
##  3 A1         B2           5.3
##  4 A1         B2           6.1
##  5 A2         B1           7.5
##  6 A2         B1           7.4
##  7 A2         B2           7.2
##  8 A2         B2           6.5
##  9 A3         B1           7.8
## 10 A3         B1           9.1
## 11 A3         B2           8.8
## 12 A3         B2           9.1
N=length(balanced$data)
y=balanced$data
X=matrix(c(rep(1,N), 
         c(rep(-1,4),rep(1,4),rep(0,4)),
         c(rep(-1,4),rep(0,4),rep(1,4)),
         c(rep(-1,2),rep(1,2),rep(-1,2),rep(1,2),rep(-1,2),rep(1,2)),
         c(rep(1,2),rep(-1,4),rep(1,2),rep(0,4)),
         c(rep(1,2),rep(-1,2),rep(0,4),rep(-1,2),rep(1,2))),
         ncol = 6,
         byrow = FALSE)
X
##       [,1] [,2] [,3] [,4] [,5] [,6]
##  [1,]    1   -1   -1   -1    1    1
##  [2,]    1   -1   -1   -1    1    1
##  [3,]    1   -1   -1    1   -1   -1
##  [4,]    1   -1   -1    1   -1   -1
##  [5,]    1    1    0   -1   -1    0
##  [6,]    1    1    0   -1   -1    0
##  [7,]    1    1    0    1    1    0
##  [8,]    1    1    0    1    1    0
##  [9,]    1    0    1   -1    0   -1
## [10,]    1    0    1   -1    0   -1
## [11,]    1    0    1    1    0    1
## [12,]    1    0    1    1    0    1
crossprod(X)
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]   12    0    0    0    0    0
## [2,]    0    8    4    0    0    0
## [3,]    0    4    8    0    0    0
## [4,]    0    0    0   12    0    0
## [5,]    0    0    0    0    8    4
## [6,]    0    0    0    0    4    8
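
The block-diagonal X^T X above reflects the sum-to-zero constraints coded into this design matrix (here with the first level of each factor expressed in terms of the others). As a cross-check, a sketch fitting the saturated model with R's sum-to-zero contrasts: contr.sum omits the last level rather than the first, so the coefficients are arranged differently, but the fitted values must agree with res.glmint.

#saturated model under sum-to-zero contrasts: fitted values match res.glmint
res.sum <- glm(data ~ `factor A` * `factor B`, family = gaussian, data = balanced,
               contrasts = list(`factor A` = "contr.sum", `factor B` = "contr.sum"))
all.equal(unname(fitted(res.sum)), unname(fitted(res.glmint)))
## [1] TRUE
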

Model6.9

\[E(Y_{jkl}) = \mu +\alpha_j+\beta_k+(\alpha\beta)_{jk}\]

Model6.10

\[E(Y_{jkl}) = \mu +\alpha_j+\beta_k\]

Model6.11

\[E(Y_{jkl}) = \mu +\alpha_j\]

Model6.12

\[E(Y_{jkl}) = \mu +\beta_k\]

#Model6.9
#E(Y_jkl) = mu + alpha_j + beta_k + (alpha beta)_jk
N=dim(X)[1]
p=dim(X)[2] #number of parameters (columns of X)
b1=crossprod(solve(crossprod(X)), crossprod(X, y))
b1
##            [,1]
## [1,]  7.3500000
## [2,] -0.2000000
## [3,]  1.3500000
## [4,] -0.1833333
## [5,] -0.1166667
## [6,]  0.4333333
sigma.sq = crossprod((y-X %*% b1))/(N-p)
#b^T X^T y
crossprod(b1, crossprod(X, y))
##        [,1]
## [1,] 662.62
sigma.sq.D1 = crossprod(y) - crossprod(b1, crossprod(X, y))
sigma.sq.D1
##      [,1]
## [1,] 1.48
Deviance1 = sigma.sq.D1/sigma.sq #scaled deviance; equals N - p when sigma^2 is estimated from the same model
Deviance1
##      [,1]
## [1,]    6
Deviance_f <- function(X1){
  N=dim(X1)[1]
  p1=dim(X1)[2] #number of parameters
  b=crossprod(solve(crossprod(X1)), crossprod(X1, y))
  sigma_sq = crossprod((y-X1 %*% b))/(N-p1) #dispersion estimate (not used below)
  sigma_sq_D1 = crossprod(y) - crossprod(b, crossprod(X1, y))
  return(sigma_sq_D1)
}
Deviance_f(X[,1:4])
##          [,1]
## [1,] 2.686667
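
The same helper reproduces the remaining deviances of Table 6.13 (column 1 of X is the mean, columns 2-3 code factor A, column 4 codes factor B):

Deviance_f(X[, 1:3])             #mean + factor A -> 3.09
Deviance_f(X[, c(1, 4)])         #mean + factor B -> 15.4267
Deviance_f(X[, 1, drop = FALSE]) #mean only       -> 15.83
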

Chapter7 Binary Variables and Logistic Regression

y=c(6,13,18,28,52,53,61,60)
n=c(59,60,62,56,63,59,62,60)
x=c(1.6907,1.7242,1.7552,1.7842,1.8113,1.8369,1.8610,1.8839)
n_y=n-y
beetle.mat=cbind(y,n_y)
beetle.mat
##       y n_y
## [1,]  6  53
## [2,] 13  47
## [3,] 18  44
## [4,] 28  28
## [5,] 52  11
## [6,] 53   6
## [7,] 61   1
## [8,] 60   0
y=c(6,13,18,28,52,53,61,60)
n=c(59,60,62,56,63,59,62,60)
x=c(1.6907,1.7242,1.7552,1.7842,1.8113,1.8369,1.8610,1.8839)
X1=matrix(c(rep(1, length(x)), x),
         ncol = 2,
         byrow = FALSE)

B=matrix(c(0, 0), ncol = 1, byrow = FALSE) #initial estimate b0 = (0, 0)
for(i in 1:6){
  #fitted probabilities at the current estimate
  pii = exp(X1 %*% B)/(1+exp(X1 %*% B))
  npi1_pi = n*pii*(1-pii)
  #score vector U
  U=matrix(c(sum(y-n*pii), 
             sum((y-n*pii)*x)),
           ncol = 1, byrow = FALSE)
  #information matrix J = X^T W X with W = diag(n*pi*(1-pi))
  J=matrix(c(sum(npi1_pi),
             sum(npi1_pi*x),
             sum(npi1_pi*x),
             sum(npi1_pi*x*x)),
           ncol=2,
           byrow = TRUE)
  J_1 = solve(J)
  #Fisher scoring update b_new = b + J^{-1} U, written here as J^{-1}(J b + U)
  B=J_1 %*% (crossprod(J, B)+U)
  print(paste0("iter: ", i))
  print("coefficients:")
  print(B)
  print("variances:")
  print(diag(J_1))
  writeLines("\n")
}
## [1] "iter: 1"
## [1] "coefficients:"
##           [,1]
## [1,] -37.85638
## [2,]  21.33743
## [1] "variances:"
## [1] 6.742888 2.092829
## 
## 
## [1] "iter: 2"
## [1] "coefficients:"
##           [,1]
## [1,] -53.85319
## [2,]  30.38351
## [1] "variances:"
## [1] 13.045111  4.087611
## 
## 
## [1] "iter: 3"
## [1] "coefficients:"
##           [,1]
## [1,] -59.96521
## [2,]  33.84419
## [1] "variances:"
## [1] 21.500222  6.777708
## 
## 
## [1] "iter: 4"
## [1] "coefficients:"
##           [,1]
## [1,] -60.70778
## [2,]  34.26485
## [1] "variances:"
## [1] 26.195147  8.274844
## 
## 
## [1] "iter: 5"
## [1] "coefficients:"
##           [,1]
## [1,] -60.71745
## [2,]  34.27032
## [1] "variances:"
## [1] 26.831382  8.477882
## 
## 
## [1] "iter: 6"
## [1] "coefficients:"
##           [,1]
## [1,] -60.71745
## [2,]  34.27033
## [1] "variances:"
## [1] 26.839770  8.480559
sqrt(diag(J_1))
## [1] 5.180711 2.912140
B
##           [,1]
## [1,] -60.71745
## [2,]  34.27033
y=c(6,13,18,28,52,53,61,60)
n=c(59,60,62,56,63,59,62,60)
x=c(1.6907,1.7242,1.7552,1.7842,1.8113,1.8369,1.8610,1.8839)
#sigma_sq
sigma_sq <- crossprod(y-n*exp(X1 %*% B)/(1+exp(X1 %*% B)))/(dim(X1)[1]-dim(X1)[2])

#Deviance
pi=y/n
pi_hat=exp(X1 %*% B)/(1+exp(X1 %*% B))
#add a tiny constant to avoid log(0) when an observed proportion equals 1
Deviance=2*sum((y*log(pi/pi_hat)+(n-y)*log((1-pi+1/1e10)/(1-pi_hat+1/1e10))))
Deviance
## [1] 11.23223
#add a tiny constant to avoid log(0)
p_null <- sum(y)/sum(n)
Deviance_null=2*sum((y*log(pi/p_null)+(n-y)*log((1-pi+1/1e10)/(1-p_null+1/1e10))))
Deviance_null
## [1] 284.2024
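
The difference between the null and fitted deviances is the likelihood-ratio statistic for the dose term, referred to chi-squared with 1 degree of freedom; here it is so large that the p-value underflows to zero:

#likelihood-ratio test for the dose effect
Deviance_null - Deviance #approximately 272.97
1 - pchisq(Deviance_null - Deviance, df = 1)
## [1] 0
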
res.glm1=glm(beetle.mat~x, family=binomial(link="logit"))
summary(res.glm1)
## 
## Call:
## glm(formula = beetle.mat ~ x, family = binomial(link = "logit"))
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -60.717      5.181  -11.72   <2e-16 ***
## x             34.270      2.912   11.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 284.202  on 7  degrees of freedom
## Residual deviance:  11.232  on 6  degrees of freedom
## AIC: 41.43
## 
## Number of Fisher Scoring iterations: 4
logLik(res.glm1)
## 'log Lik.' -18.71513 (df=2)
#minimal model with no explanatory variable
res.glm.mini=glm(beetle.mat~1, family=binomial(link="logit"))
summary(res.glm.mini)
## 
## Call:
## glm(formula = beetle.mat ~ 1, family = binomial(link = "logit"))
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.42630    0.09327   4.571 4.87e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 284.2  on 7  degrees of freedom
## Residual deviance: 284.2  on 7  degrees of freedom
## AIC: 312.4
## 
## Number of Fisher Scoring iterations: 4
logLik(res.glm.mini)
## 'log Lik.' -155.2002 (df=1)
#AIC
-2*logLik(res.glm.mini)+2*1
## 'log Lik.' 312.4005 (df=1)
y=c(6,13,18,28,52,53,61,60)
n=c(59,60,62,56,63,59,62,60)
x=c(1.6907,1.7242,1.7552,1.7842,1.8113,1.8369,1.8610,1.8839)
#log likelihood of model with no explanatory variable
pi_tide <- sum(y)/sum(n)
sum(log(choose(n, y)))
## [1] 167.5203
sum(y*log(pi_tide)+(n-y)*log(1-pi_tide))
## [1] -322.7205
sum(log(choose(n, y))+y*log(pi_tide)+(n-y)*log(1-pi_tide))
## [1] -155.2002
fitted.values(res.glm1)
##          1          2          3          4          5          6          7 
## 0.05860103 0.16402787 0.36211901 0.60531491 0.79517177 0.90323582 0.95519611 
##          8 
## 0.97904934
fit_p=c(fitted.values(res.glm1))
fit_y=n*fit_p
fit_y
##         1         2         3         4         5         6         7         8 
##  3.457461  9.841672 22.451378 33.897635 50.095822 53.290913 59.222159 58.742961
#logistic model 
res.glm1=glm(beetle.mat~x, family=binomial(link="logit"))
summary(res.glm1)
## 
## Call:
## glm(formula = beetle.mat ~ x, family = binomial(link = "logit"))
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -60.717      5.181  -11.72   <2e-16 ***
## x             34.270      2.912   11.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 284.202  on 7  degrees of freedom
## Residual deviance:  11.232  on 6  degrees of freedom
## AIC: 41.43
## 
## Number of Fisher Scoring iterations: 4
logLik(res.glm1)
## 'log Lik.' -18.71513 (df=2)
res.glm1$deviance
## [1] 11.23223
res.glm1$residuals
##           1           2           3           4           5           6 
##  0.78115418  0.38388091 -0.31082206 -0.44081641  0.18557365 -0.05641516 
##           7           8 
##  0.67002811  1.02139898
res.glm1$resid
##           1           2           3           4           5           6 
##  0.78115418  0.38388091 -0.31082206 -0.44081641  0.18557365 -0.05641516 
##           7           8 
##  0.67002811  1.02139898
#working
resid(res.glm1,type="working")
##           1           2           3           4           5           6 
##  0.78115418  0.38388091 -0.31082206 -0.44081641  0.18557365 -0.05641516 
##           7           8 
##  0.67002811  1.02139898
#deviance
resid(res.glm1,type="dev")
##          1          2          3          4          5          6          7 
##  1.2836777  1.0596900 -1.1961123 -1.5941244  0.6061405 -0.1271584  1.2510711 
##          8 
##  1.5939850
#response
resid(res.glm1,type="resp")
##            1            2            3            4            5            6 
##  0.043093890  0.052638798 -0.071796425 -0.105314906  0.030225053 -0.004930735 
##            7            8 
##  0.028674861  0.020950656
#pearson
resid(res.glm1,type="pear")
##          1          2          3          4          5          6          7 
##  1.4092960  1.1011003 -1.1762596 -1.6123815  0.5944454 -0.1281090  1.0914228 
##          8 
##  1.1331102
#predict() returns the linear predictor x^T beta
#fitted.values are the logistic transform of the linear predictor
res.glm1$fitted.values
##          1          2          3          4          5          6          7 
## 0.05860103 0.16402787 0.36211901 0.60531491 0.79517177 0.90323582 0.95519611 
##          8 
## 0.97904934
predict(res.glm1)
##          1          2          3          4          5          6          7 
## -2.7766148 -1.6285589 -0.5661788  0.4276606  1.3563864  2.2337068  3.0596216 
##          8 
##  3.8444121
exp(predict(res.glm1))/(1+exp(predict(res.glm1)))
##          1          2          3          4          5          6          7 
## 0.05860103 0.16402787 0.36211901 0.60531491 0.79517177 0.90323582 0.95519611 
##          8 
## 0.97904934
#working residuals
resid(res.glm1,type="working")
##           1           2           3           4           5           6 
##  0.78115418  0.38388091 -0.31082206 -0.44081641  0.18557365 -0.05641516 
##           7           8 
##  0.67002811  1.02139898
#mu is the linear predictor x^T beta
mu = predict(res.glm1)
#pi is the logistic transform of the linear predictor
pi = exp(mu)/(1+exp(mu))
(y/n-pi) / (pi*(1-pi))
##           1           2           3           4           5           6 
##  0.78115418  0.38388091 -0.31082206 -0.44081641  0.18557365 -0.05641516 
##           7           8 
##  0.67002811  1.02139898
#response residuals
resid(res.glm1,type="resp")
##            1            2            3            4            5            6 
##  0.043093890  0.052638798 -0.071796425 -0.105314906  0.030225053 -0.004930735 
##            7            8 
##  0.028674861  0.020950656
y/n - res.glm1$fitted.values
##            1            2            3            4            5            6 
##  0.043093890  0.052638798 -0.071796425 -0.105314906  0.030225053 -0.004930735 
##            7            8 
##  0.028674861  0.020950656
#pearson residuals
resid(res.glm1,type="pear")
##          1          2          3          4          5          6          7 
##  1.4092960  1.1011003 -1.1762596 -1.6123815  0.5944454 -0.1281090  1.0914228 
##          8 
##  1.1331102
pi = exp(mu)/(1+exp(mu))
(y/n-pi) / sqrt((pi*(1-pi))/n)
##          1          2          3          4          5          6          7 
##  1.4092960  1.1011003 -1.1762596 -1.6123815  0.5944454 -0.1281090  1.0914228 
##          8 
##  1.1331102
#or
(y-n*pi) / sqrt(n*pi*(1-pi))
##          1          2          3          4          5          6          7 
##  1.4092960  1.1011003 -1.1762596 -1.6123815  0.5944454 -0.1281090  1.0914228 
##          8 
##  1.1331102

Deviance is \[\begin{align} D&=2[l(b_{max};y)-l(b;y)]\\ &=2\sum_{i=1}^{N}\left[y_i\log(y_i/\hat{y}_i)+(n_i-y_i)\log\left(\frac{n_i-y_i}{n_i-\hat{y}_i}\right)\right]\\ \end{align}\]

For comparison, the deviance residuals for a Poisson model are \[sign(o_i-e_i)*\sqrt{d_i}=sign(o_i-e_i)*\sqrt{2[o_i\log(o_i/e_i)-(o_i-e_i)]}, \quad i=1,\dots,N\]

#Deviance residuals (a tiny constant avoids log(0))
resid(res.glm1,type="dev")
##          1          2          3          4          5          6          7 
##  1.2836777  1.0596900 -1.1961123 -1.5941244  0.6061405 -0.1271584  1.2510711 
##          8 
##  1.5939850
pi = exp(mu)/(1+exp(mu))
sqrt(-2*(y*log(pi/(y/n))+(n-y)*log((1-pi+1e-10)/(1-y/n+1e-10))))*sign(y/n-pi)
##          1          2          3          4          5          6          7 
##  1.2836777  1.0596900 -1.1961123 -1.5941244  0.6061405 -0.1271584  1.2510711 
##          8 
##  1.5939850
#sum squared deviance residuals is the model deviance
sum(resid(res.glm1,type="dev")^2)
## [1] 11.23223
res.glm1$deviance
## [1] 11.23223
sum(-2*(y*log(pi/(y/n))+(n-y)*log((1-pi+1e-10)/(1-y/n+1e-10))))
## [1] 11.23223
#partial residuals
resid(res.glm1,type="partial")
##            x
## 1 -2.7392650
## 2 -1.9884824
## 3 -1.6208053
## 4 -0.7569602
## 5  0.7981557
## 6  1.4334872
## 7  2.9858454
## 8  4.1220067
## attr(,"constant")
## [1] 0.7438044
pi = exp(mu)/(1+exp(mu))
(y/n-pi) / (pi*(1-pi)) + res.glm1$coefficients[2]*(x - mean(x))
##          1          2          3          4          5          6          7 
## -2.7392650 -1.9884824 -1.6208053 -0.7569602  0.7981557  1.4334872  2.9858454 
##          8 
##  4.1220067
res.glm2=glm(beetle.mat~x, family=binomial(link="probit"))
summary(res.glm2)
## 
## Call:
## glm(formula = beetle.mat ~ x, family = binomial(link = "probit"))
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -34.935      2.648  -13.19   <2e-16 ***
## x             19.728      1.487   13.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 284.20  on 7  degrees of freedom
## Residual deviance:  10.12  on 6  degrees of freedom
## AIC: 40.318
## 
## Number of Fisher Scoring iterations: 4
fit_p2=c(fitted.values(res.glm2))
fit_y2=n*fit_p2
fit_y2
##         1         2         3         4         5         6         7         8 
##  3.357774 10.721610 23.481932 33.815505 49.615626 53.318874 59.664650 59.227967
res.glm3=glm(beetle.mat~x, family=binomial(link="cloglog"))
summary(res.glm3)
## 
## Call:
## glm(formula = beetle.mat ~ x, family = binomial(link = "cloglog"))
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -39.572      3.240  -12.21   <2e-16 ***
## x             22.041      1.799   12.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 284.2024  on 7  degrees of freedom
## Residual deviance:   3.4464  on 6  degrees of freedom
## AIC: 33.644
## 
## Number of Fisher Scoring iterations: 4
fit_p3=c(fitted.values(res.glm3))
fit_y3=n*fit_p3
fit_y3
##        1        2        3        4        5        6        7        8 
##  5.58945 11.28068 20.95422 30.36944 47.77642 54.14273 61.11331 59.94723
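
Collecting the three fits, the complementary log-log link gives by far the smallest AIC for the beetle data (a sketch; the values restate the summaries above):

#compare link functions by AIC
sapply(list(logit = res.glm1, probit = res.glm2, cloglog = res.glm3), AIC)
#logit 41.430, probit 40.318, cloglog 33.644 (from the summaries above)
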
senility
## # A tibble: 54 × 2
##        x     s
##    <dbl> <dbl>
##  1     9     1
##  2    13     1
##  3     6     1
##  4     8     1
##  5    10     1
##  6     4     1
##  7    14     1
##  8     8     1
##  9    11     1
## 10     7     1
## 11     9     1
## 12     7     1
## 13     5     1
## 14    14     1
## 15    13     0
## 16    16     0
## 17    10     0
## 18    12     0
## 19    11     0
## 20    14     0
## 21    15     0
## 22    18     0
## 23     7     0
## 24    16     0
## 25     9     0
## # ℹ 29 more rows
#Table 7.9
senility2 <- senility |> group_by(x) |> 
  mutate(y=sum(s),
         n=n(),
         ) |> distinct(x, .keep_all = TRUE) |> 
  arrange(x) |> ungroup()
senility2
## # A tibble: 17 × 4
##        x     s     y     n
##    <dbl> <dbl> <dbl> <int>
##  1     4     1     1     2
##  2     5     1     1     1
##  3     6     1     1     2
##  4     7     1     2     3
##  5     8     1     2     2
##  6     9     1     2     6
##  7    10     1     1     6
##  8    11     1     1     6
##  9    12     0     0     2
## 10    13     1     1     6
## 11    14     1     2     7
## 12    15     0     0     3
## 13    16     0     0     4
## 14    17     0     0     1
## 15    18     0     0     1
## 16    19     0     0     1
## 17    20     0     0     1
res.glm.senility=glm(cbind(y,n-y)~x,family=binomial(link="logit"),data=senility2)
summary(res.glm.senility)
## 
## Call:
## glm(formula = cbind(y, n - y) ~ x, family = binomial(link = "logit"), 
##     data = senility2)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   2.4040     1.1918   2.017  0.04369 * 
## x            -0.3235     0.1140  -2.838  0.00453 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20.208  on 16  degrees of freedom
## Residual deviance:  9.419  on 15  degrees of freedom
## AIC: 27.792
## 
## Number of Fisher Scoring iterations: 5
#Table 7.9
predicts <- predict(res.glm.senility)
pi_tide2 <- exp(predicts)/(1+exp(predicts))
pearson_residuals <- resid(res.glm.senility, type="pear")
deviance_residuals <- resid(res.glm.senility, type="dev")
senility2 |> mutate(pi_tide=pi_tide2,
                    X=pearson_residuals,
                    d=deviance_residuals)
## # A tibble: 17 × 7
##        x     s     y     n pi_tide      X      d
##    <dbl> <dbl> <dbl> <int>   <dbl>  <dbl>  <dbl>
##  1     4     1     1     2  0.752  -0.826 -0.766
##  2     5     1     1     1  0.687   0.675  0.866
##  3     6     1     1     2  0.614  -0.330 -0.326
##  4     7     1     2     3  0.535   0.458  0.464
##  5     8     1     2     2  0.454   1.55   1.78 
##  6     9     1     2     6  0.376  -0.214 -0.216
##  7    10     1     1     6  0.303  -0.728 -0.771
##  8    11     1     1     6  0.240  -0.419 -0.436
##  9    12     0     0     2  0.186  -0.675 -0.906
## 10    13     1     1     6  0.142   0.176  0.172
## 11    14     1     2     7  0.107   1.53   1.31 
## 12    15     0     0     3  0.0795 -0.509 -0.705
## 13    16     0     0     4  0.0588 -0.500 -0.696
## 14    17     0     0     1  0.0433 -0.213 -0.297
## 15    18     0     0     1  0.0317 -0.181 -0.254
## 16    19     0     0     1  0.0231 -0.154 -0.216
## 17    20     0     0     1  0.0168 -0.131 -0.184
senility2 |> mutate(pi_tide=pi_tide2,
                    X=pearson_residuals,
                    d=deviance_residuals) |> 
  summarize(across(everything(), sum))
## # A tibble: 1 × 7
##       x     s     y     n pi_tide      X     d
##   <dbl> <dbl> <dbl> <int>   <dbl>  <dbl> <dbl>
## 1   204    10    14    54    4.65 -0.486 -1.19
senility3 <- senility2 |> mutate(pi_tide=pi_tide2,
                    X=pearson_residuals,
                    d=deviance_residuals,
                    e=n*pi_tide,
                    n_e=n-e)
sum(senility3$X^2)
## [1] 8.083029
sum(senility3$d^2)
## [1] 9.41897
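
Both statistics can be referred to chi-squared with 15 degrees of freedom as an informal goodness-of-fit check; neither comes close to significance, so the logistic model fits adequately:

#goodness of fit against chi-squared(15)
1 - pchisq(sum(senility3$X^2), df = 15) #Pearson chi-squared X^2 = 8.083
1 - pchisq(sum(senility3$d^2), df = 15) #deviance D = 9.419
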
senility3[senility3$pi_tide <= 0.107,]
## # A tibble: 7 × 9
##       x     s     y     n pi_tide      X      d      e   n_e
##   <dbl> <dbl> <dbl> <int>   <dbl>  <dbl>  <dbl>  <dbl> <dbl>
## 1    14     1     2     7  0.107   1.53   1.31  0.747  6.25 
## 2    15     0     0     3  0.0795 -0.509 -0.705 0.239  2.76 
## 3    16     0     0     4  0.0588 -0.500 -0.696 0.235  3.76 
## 4    17     0     0     1  0.0433 -0.213 -0.297 0.0433 0.957
## 5    18     0     0     1  0.0317 -0.181 -0.254 0.0317 0.968
## 6    19     0     0     1  0.0231 -0.154 -0.216 0.0231 0.977
## 7    20     0     0     1  0.0168 -0.131 -0.184 0.0168 0.983
senility3[senility3$pi_tide > 0.107 & senility3$pi_tide <= 0.304,]
## # A tibble: 4 × 9
##       x     s     y     n pi_tide      X      d     e   n_e
##   <dbl> <dbl> <dbl> <int>   <dbl>  <dbl>  <dbl> <dbl> <dbl>
## 1    10     1     1     6   0.303 -0.728 -0.771 1.82   4.18
## 2    11     1     1     6   0.240 -0.419 -0.436 1.44   4.56
## 3    12     0     0     2   0.186 -0.675 -0.906 0.371  1.63
## 4    13     1     1     6   0.142  0.176  0.172 0.850  5.15
senility3[senility3$pi_tide > 0.304,]
## # A tibble: 6 × 9
##       x     s     y     n pi_tide      X      d     e   n_e
##   <dbl> <dbl> <dbl> <int>   <dbl>  <dbl>  <dbl> <dbl> <dbl>
## 1     4     1     1     2   0.752 -0.826 -0.766 1.50  0.496
## 2     5     1     1     1   0.687  0.675  0.866 0.687 0.313
## 3     6     1     1     2   0.614 -0.330 -0.326 1.23  0.773
## 4     7     1     2     3   0.535  0.458  0.464 1.60  1.40 
## 5     8     1     2     2   0.454  1.55   1.78  0.908 1.09 
## 6     9     1     2     6   0.376 -0.214 -0.216 2.25  3.75
#Table 7.10
senility3[senility3$pi_tide <= 0.107,] |> summarize(across(1:9, sum)) |> as.data.frame()
##     x s y  n   pi_tide          X         d        e      n_e
## 1 119 1 2 18 0.3599508 -0.1527076 -1.047855 1.335407 16.66459
senility3[senility3$pi_tide > 0.107 & senility3$pi_tide <= 0.304,] |> summarize(across(1:9, sum)) |> as.data.frame()
##    x s y  n   pi_tide         X         d        e      n_e
## 1 46 3 3 20 0.8703006 -1.646441 -1.941109 4.479079 15.52092
senility3[senility3$pi_tide > 0.304,] |> summarize(across(1:9, sum)) |> as.data.frame()
##    x s y  n  pi_tide        X        d        e      n_e
## 1 39 6 9 16 3.417445 1.313162 1.799157 8.185514 7.814486
X_HL=sum((2-1.335)^2/1.335,
         (3-4.479)^2/4.479,
         (9-8.186)^2/8.186,
         (16-16.665)^2/16.665,
         (17-15.521)^2/15.521,
         (7-7.814)^2/7.814
         )
X_HL
## [1] 1.152841
1-pchisq(X_HL, df=1)
## [1] 0.2829553
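
The same statistic can be computed programmatically instead of by hand. A minimal sketch, assuming the same three risk groups defined by the cut points 0.107 and 0.304 used above:

#Hosmer-Lemeshow statistic computed directly from senility3
hl <- senility3 |>
  mutate(group = cut(pi_tide, breaks = c(0, 0.107, 0.304, 1))) |>
  group_by(group) |>
  summarize(o = sum(y), e = sum(n*pi_tide), m = sum(n))
X_HL2 <- sum((hl$o - hl$e)^2/hl$e + ((hl$m - hl$o) - (hl$m - hl$e))^2/(hl$m - hl$e))
X_HL2 #about 1.15, matching the hand calculation above
1 - pchisq(X_HL2, df = nrow(hl) - 2)
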
res.glm.senility.mini=glm(cbind(y,n-y)~1,family=binomial(link="logit"),data=senility2)
summary(res.glm.senility.mini)
## 
## Call:
## glm(formula = cbind(y, n - y) ~ 1, family = binomial(link = "logit"), 
##     data = senility2)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.0498     0.3105  -3.381 0.000723 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20.208  on 16  degrees of freedom
## Residual deviance: 20.208  on 16  degrees of freedom
## AIC: 36.581
## 
## Number of Fisher Scoring iterations: 3
#AIC
AIC(res.glm.senility.mini)
## [1] 36.5808
-2*c(logLik(res.glm.senility.mini))+2*1
## [1] 36.5808
#BIC
BIC(res.glm.senility.mini)
## [1] 37.41401
res.glm.senility.mini$aic-2*1+1*log(length(senility2$y))
## [1] 37.41401
predict(res.glm.senility.mini)
##         1         2         3         4         5         6         7         8 
## -1.049822 -1.049822 -1.049822 -1.049822 -1.049822 -1.049822 -1.049822 -1.049822 
##         9        10        11        12        13        14        15        16 
## -1.049822 -1.049822 -1.049822 -1.049822 -1.049822 -1.049822 -1.049822 -1.049822 
##        17 
## -1.049822
constant <- sum(log(choose(senility2$n, senility2$y)))
constant
## [1] 13.61276
logLik(res.glm.senility)
## 'log Lik.' -11.89593 (df=2)
logLik(res.glm.senility.mini)
## 'log Lik.' -17.2904 (df=1)
#pseudo R2
R2 <- (logLik(res.glm.senility.mini)-logLik(res.glm.senility))/(logLik(res.glm.senility.mini)-constant)
R2
## 'log Lik.' 0.1745604 (df=1)
#ungrouped
res.glm.senility.ungrouped=glm(s~x,family=binomial(link="logit"),data=senility)
summary(res.glm.senility.ungrouped)
## 
## Call:
## glm(formula = s ~ x, family = binomial(link = "logit"), data = senility)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   2.4040     1.1918   2.017  0.04369 * 
## x            -0.3235     0.1140  -2.838  0.00453 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 61.806  on 53  degrees of freedom
## Residual deviance: 51.017  on 52  degrees of freedom
## AIC: 55.017
## 
## Number of Fisher Scoring iterations: 5
logLik(res.glm.senility.ungrouped)
## 'log Lik.' -25.50869 (df=2)
#ungrouped
res.glm.senility.ungrouped.mini=glm(s~1,family=binomial(link="logit"),data=senility)
summary(res.glm.senility.ungrouped.mini)
## 
## Call:
## glm(formula = s ~ 1, family = binomial(link = "logit"), data = senility)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.0498     0.3105  -3.381 0.000723 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 61.806  on 53  degrees of freedom
## Residual deviance: 61.806  on 53  degrees of freedom
## AIC: 63.806
## 
## Number of Fisher Scoring iterations: 4
logLik(res.glm.senility.ungrouped.mini)
## 'log Lik.' -30.90316 (df=1)
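
The grouped and ungrouped log-likelihoods differ only by the constant \(\sum\log{n_i \choose y_i}\) computed above, which is why the pseudo \(R^2\) can equally be obtained from the ungrouped fits. A quick check:

#grouped log-likelihood minus the constant reproduces the ungrouped one
c(logLik(res.glm.senility)) - constant #-25.50869
c(logLik(res.glm.senility.mini)) - constant #-30.90316
1 - c(logLik(res.glm.senility.ungrouped))/c(logLik(res.glm.senility.ungrouped.mini)) #0.1746
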
data(anthers)
anthers
## # A tibble: 6 × 4
##       y     n storage centrifuge
##   <dbl> <dbl>   <dbl>      <dbl>
## 1    55   102       1         40
## 2    52    99       1        150
## 3    57   108       1        350
## 4    55    76       2         40
## 5    50    81       2        150
## 6    50    90       2        350
#model1: different intercepts and different slopes
j <- anthers$storage
newstor <- j-1
x <- log(anthers$centrifuge)
res.glm1=glm(cbind(y,n-y)~newstor*x,family=binomial(link="logit"),data=anthers)
summary(res.glm1)
## 
## Call:
## glm(formula = cbind(y, n - y) ~ newstor * x, family = binomial(link = "logit"), 
##     data = anthers)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  0.23389    0.62839   0.372   0.7097  
## newstor      1.97711    0.99802   1.981   0.0476 *
## x           -0.02274    0.12685  -0.179   0.8577  
## newstor:x   -0.31862    0.19888  -1.602   0.1091  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10.451974  on 5  degrees of freedom
## Residual deviance:  0.027728  on 2  degrees of freedom
## AIC: 37.596
## 
## Number of Fisher Scoring iterations: 3
#model2: different intercepts, shared slope
data(anthers)
j <- anthers$storage
newstor <- j-1
x <- log(anthers$centrifuge)
res.glm2=glm(cbind(y,n-y)~newstor+x,family=binomial(link="logit"),data=anthers)
summary(res.glm2)
## 
## Call:
## glm(formula = cbind(y, n - y) ~ newstor + x, family = binomial(link = "logit"), 
##     data = anthers)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  0.87673    0.48701   1.800   0.0718 .
## newstor      0.40684    0.17462   2.330   0.0198 *
## x           -0.15459    0.09702  -1.593   0.1111  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10.4520  on 5  degrees of freedom
## Residual deviance:  2.6188  on 3  degrees of freedom
## AIC: 38.187
## 
## Number of Fisher Scoring iterations: 3
#model3: shared intercept and slope
data(anthers)
x <- log(anthers$centrifuge)
res.glm3=glm(cbind(y,n-y)~x,family=binomial(link="logit"),data=anthers)
summary(res.glm3)
## 
## Call:
## glm(formula = cbind(y, n - y) ~ x, family = binomial(link = "logit"), 
##     data = anthers)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   1.0213     0.4813   2.122   0.0338 *
## x            -0.1478     0.0965  -1.532   0.1255  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10.4520  on 5  degrees of freedom
## Residual deviance:  8.0916  on 4  degrees of freedom
## AIC: 41.66
## 
## Number of Fisher Scoring iterations: 3
#fitted model1
fit_p1=c(fitted.values(res.glm1))
fit_y1=anthers$n*fit_p1
fit_y1
##        1        2        3        4        5        6 
## 54.81819 52.46542 56.71639 54.83190 50.43032 49.73778
#fitted model2
fit_p2=c(fitted.values(res.glm2))
fit_y2=anthers$n*fit_p2
fit_y2
##        1        2        3        4        5        6 
## 58.75401 52.02530 53.22069 51.00579 50.58959 53.40461
#fitted model3
fit_p3=c(fitted.values(res.glm3))
fit_y3=anthers$n*fit_p3
fit_y3
##        1        2        3        4        5        6 
## 62.91247 56.39788 58.18362 46.87596 46.14372 48.48635
res.glm1$null.deviance
## [1] 10.45197
res.glm1$deviance
## [1] 0.0277278
res.glm2$null.deviance
## [1] 10.45197
res.glm2$deviance
## [1] 2.618837
res.glm3$null.deviance
## [1] 10.45197
res.glm3$deviance
## [1] 8.091578
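
Because the three models are nested, the deviance differences above can also be obtained as likelihood ratio tests; a short check with anova():

#LRT comparing the nested anthers models
anova(res.glm3, res.glm2, res.glm1, test = "Chisq")
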
#senility data revisited
data(senility)
senility
## # A tibble: 54 × 2
##        x     s
##    <dbl> <dbl>
##  1     9     1
##  2    13     1
##  3     6     1
##  4     8     1
##  5    10     1
##  6     4     1
##  7    14     1
##  8     8     1
##  9    11     1
## 10     7     1
## 11     9     1
## 12     7     1
## 13     5     1
## 14    14     1
## 15    13     0
## 16    16     0
## 17    10     0
## 18    12     0
## 19    11     0
## 20    14     0
## 21    15     0
## 22    18     0
## 23     7     0
## 24    16     0
## 25     9     0
## # ℹ 29 more rows
library(tidyverse)
senility2 <- senility |> 
  group_by(x) |> 
  summarize(y = sum(s),
            n = n())

senility2
## # A tibble: 17 × 3
##        x     y     n
##    <dbl> <dbl> <int>
##  1     4     1     2
##  2     5     1     1
##  3     6     1     2
##  4     7     2     3
##  5     8     2     2
##  6     9     2     6
##  7    10     1     6
##  8    11     1     6
##  9    12     0     2
## 10    13     1     6
## 11    14     2     7
## 12    15     0     3
## 13    16     0     4
## 14    17     0     1
## 15    18     0     1
## 16    19     0     1
## 17    20     0     1
res.glm=glm(cbind(y,n-y)~x,family=binomial(link="logit"),data=senility2)
summary(res.glm)
## 
## Call:
## glm(formula = cbind(y, n - y) ~ x, family = binomial(link = "logit"), 
##     data = senility2)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   2.4040     1.1918   2.017  0.04369 * 
## x            -0.3235     0.1140  -2.838  0.00453 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20.208  on 16  degrees of freedom
## Residual deviance:  9.419  on 15  degrees of freedom
## AIC: 27.792
## 
## Number of Fisher Scoring iterations: 5
res.glm$deviance
## [1] 9.41897
pchisq(res.glm$deviance, df=15) #lower-tail probability; the upper tail 1-0.145=0.855 gives no evidence of lack of fit
## [1] 0.1453876
min(senility2$x)
## [1] 4
max(senility2$x)
## [1] 20
x2 <- data.frame(x = seq(min(senility2$x), max(senility2$x), by=0.1))
predict <- predict(res.glm, x2, type="response")
predict2 <- cbind(x2, data.frame(predict))
head(predict2)
##     x   predict
## 1 4.0 0.7521145
## 2 4.1 0.7460336
## 3 4.2 0.7398551
## 4 4.3 0.7335800
## 5 4.4 0.7272093
## 6 4.5 0.7207443
senility3 <- senility2 |> 
  mutate(
    proportion = y/n
  )
senility3
## # A tibble: 17 × 4
##        x     y     n proportion
##    <dbl> <dbl> <int>      <dbl>
##  1     4     1     2      0.5  
##  2     5     1     1      1    
##  3     6     1     2      0.5  
##  4     7     2     3      0.667
##  5     8     2     2      1    
##  6     9     2     6      0.333
##  7    10     1     6      0.167
##  8    11     1     6      0.167
##  9    12     0     2      0    
## 10    13     1     6      0.167
## 11    14     2     7      0.286
## 12    15     0     3      0    
## 13    16     0     4      0    
## 14    17     0     1      0    
## 15    18     0     1      0    
## 16    19     0     1      0    
## 17    20     0     1      0
ggplot(senility3, aes(x = x, y = proportion)) + 
  geom_point(size=2) +
  geom_line(data=predict2, aes(x = x, y = predict)) + theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
) + scale_x_continuous(limits = c(4, 20), expand = c(0, 0))

res.glm$coefficients[1]
## (Intercept) 
##    2.404043
senility3 |> 
  mutate(estimate = exp(res.glm$coefficients[1]+res.glm$coefficients[2]*x)/(1+exp(res.glm$coefficients[1]+res.glm$coefficients[2]*x))) |> 
  mutate(X = (y-n*estimate)/sqrt(n*estimate*(1-estimate)))
## # A tibble: 17 × 6
##        x     y     n proportion estimate      X
##    <dbl> <dbl> <int>      <dbl>    <dbl>  <dbl>
##  1     4     1     2      0.5     0.752  -0.826
##  2     5     1     1      1       0.687   0.675
##  3     6     1     2      0.5     0.614  -0.330
##  4     7     2     3      0.667   0.535   0.458
##  5     8     2     2      1       0.454   1.55 
##  6     9     2     6      0.333   0.376  -0.214
##  7    10     1     6      0.167   0.303  -0.728
##  8    11     1     6      0.167   0.240  -0.419
##  9    12     0     2      0       0.186  -0.675
## 10    13     1     6      0.167   0.142   0.176
## 11    14     2     7      0.286   0.107   1.53 
## 12    15     0     3      0       0.0795 -0.509
## 13    16     0     4      0       0.0588 -0.500
## 14    17     0     1      0       0.0433 -0.213
## 15    18     0     1      0       0.0317 -0.181
## 16    19     0     1      0       0.0231 -0.154
## 17    20     0     1      0       0.0168 -0.131
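
The same Pearson residuals are available directly from the fitted object:

resid(res.glm, type = "pearson")
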
res.glm=glm(s~x,family=binomial(link="logit"),data=senility)
summary(res.glm)
## 
## Call:
## glm(formula = s ~ x, family = binomial(link = "logit"), data = senility)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   2.4040     1.1918   2.017  0.04369 * 
## x            -0.3235     0.1140  -2.838  0.00453 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 61.806  on 53  degrees of freedom
## Residual deviance: 51.017  on 52  degrees of freedom
## AIC: 55.017
## 
## Number of Fisher Scoring iterations: 5
library(doBy)
waisgrp=summaryBy(s~x,data=senility,FUN=c(sum,length))
names(waisgrp)=c('x','y','n')
waisgrp
## # A tibble: 17 × 3
##        x     y     n
##    <dbl> <dbl> <dbl>
##  1     4     1     2
##  2     5     1     1
##  3     6     1     2
##  4     7     2     3
##  5     8     2     2
##  6     9     2     6
##  7    10     1     6
##  8    11     1     6
##  9    12     0     2
## 10    13     1     6
## 11    14     2     7
## 12    15     0     3
## 13    16     0     4
## 14    17     0     1
## 15    18     0     1
## 16    19     0     1
## 17    20     0     1
res.glm=glm(cbind(y, n-y)~x,family=binomial(link="logit"), data=waisgrp)
summary(res.glm)
## 
## Call:
## glm(formula = cbind(y, n - y) ~ x, family = binomial(link = "logit"), 
##     data = waisgrp)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   2.4040     1.1918   2.017  0.04369 * 
## x            -0.3235     0.1140  -2.838  0.00453 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20.208  on 16  degrees of freedom
## Residual deviance:  9.419  on 15  degrees of freedom
## AIC: 27.792
## 
## Number of Fisher Scoring iterations: 5
anthers
## # A tibble: 6 × 4
##       y     n storage centrifuge
##   <dbl> <dbl>   <dbl>      <dbl>
## 1    55   102       1         40
## 2    52    99       1        150
## 3    57   108       1        350
## 4    55    76       2         40
## 5    50    81       2        150
## 6    50    90       2        350
anthers.sum <- aggregate(anthers[c("n","y")], by = anthers[c("storage")], FUN=sum)
anthers.sum
##   storage   n   y
## 1       1 309 164
## 2       2 247 155
dput(anthers.sum)
## structure(list(storage = c(1, 2), n = c(309, 247), y = c(164, 
## 155)), row.names = c(NA, -2L), class = "data.frame")
anthers.sum <- structure(list(storage = c(1, 2), n = c(309, 247), y = c(164, 
155)), row.names = c(NA, -2L), class = "data.frame")
logit_model <- glm(cbind(y, n-y) ~ storage, data=anthers.sum, family=binomial(link='logit'))
log_model <- glm(cbind(y, n-y) ~ storage, data=anthers.sum, family=binomial(link='log'))
logit_model$coefficients
## (Intercept)     storage 
##  -0.2753712   0.3985039
log_model$coefficients
## (Intercept)     storage 
##  -0.8009865   0.1675116
summary(logit_model)
## 
## Call:
## glm(formula = cbind(y, n - y) ~ storage, family = binomial(link = "logit"), 
##     data = anthers.sum)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -0.2754     0.2632  -1.046   0.2955  
## storage       0.3985     0.1741   2.289   0.0221 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  5.2790e+00  on 1  degrees of freedom
## Residual deviance: -3.7748e-14  on 0  degrees of freedom
## AIC: 16.079
## 
## Number of Fisher Scoring iterations: 2
stats::confint(logit_model)
##                   2.5 %    97.5 %
## (Intercept) -0.79161864 0.2410017
## storage      0.05841082 0.7414014

Newton-Raphson method:

The log-likelihood function is \[l=\sum_{i=1}^{N}\left[y_i(\beta_1+\beta_2x_i)-n_i\log\left[1+\exp(\beta_1+\beta_2x_i)\right]+\log{n_i \choose y_i}\right]\] and the score vector is \[U=\left[\begin{matrix} \frac{\partial l}{\partial\beta_1} \\ \frac{\partial l}{\partial\beta_2} \end{matrix}\right]=\left[\begin{matrix} \sum(y_i-n_i\pi_i) \\ \sum x_i(y_i-n_i\pi_i) \end{matrix}\right]\] where
\[\pi_i=\frac{\exp(\beta_1+\beta_2x_i)}{1+\exp(\beta_1+\beta_2x_i)}\]

The information matrix is

\[J=\left[\begin{matrix} \sum n_i\pi_i(1-\pi_i) & \sum n_ix_i\pi_i(1-\pi_i) \\ \sum n_ix_i\pi_i(1-\pi_i) & \sum n_ix_i^2\pi_i(1-\pi_i) \end{matrix}\right]\]

The iteration is

\[J^{(m-1)} b^{(m)}=J^{(m-1)} b^{(m-1)}+U^{(m-1)}\]

For a log-likelihood function of a single parameter \(\beta\), the first three terms of the Taylor series approximation near an estimate \(b\) are

\[\begin{align}l(\beta)&\approx l(b)+(\beta-b)U(b)+\frac{1}{2}(\beta-b)^2U'(b)\\ &=l(b)+(\beta-b)U(b)-\frac{1}{2}(\beta-b)^2J(b) \end{align}\] When \(b\) is the maximum likelihood estimate, \(U(b)=0\), so \[l(\beta)-l(b)=-\frac{1}{2}(\beta-b)^2J(b)\] Moreover, because \(J=\mathbb E(UU^T)\), \[\mathbb E[(b-\beta)(b-\beta)^T]=J^{-1}\mathbb E(UU^T)J^{-1}=J^{-1}\] and therefore \[2[l(b)-l(\beta)]=(b-\beta)^2J(b)\sim \chi^2(p)\]

For the one-parameter case, \(b\sim N(\beta, J^{-1})\)

#design matrix: intercept and storage code for the two groups
X <- matrix(c(1, 1,
              1, 2), ncol=2,
            byrow = TRUE)
#starting values b = (0, 0)
B=matrix(c(0, 0), ncol = 1)
n <- anthers.sum$n
y <- anthers.sum$y
x <- anthers.sum$storage
for(i in 1:6){
  #fitted probabilities at the current estimate b
  pii = exp(X %*% B)/(1+exp(X %*% B))
  npi1_pi = n*pii*(1-pii)
  #score vector U and information matrix J evaluated at b
  U=matrix(c(sum(y-n*pii), 
             sum((y-n*pii)*x)),
           ncol = 1)
  J=matrix(c(sum(npi1_pi),
             sum(npi1_pi*x),
             sum(npi1_pi*x),
             sum(npi1_pi*x*x)),
           ncol=2,
           byrow = TRUE)
  J_1 = solve(J)
  #Fisher scoring update: J b_new = J b_old + U, i.e. b_new = b_old + J^{-1} U
  B=J_1 %*% (crossprod(J, B)+U)
  print(paste0("iter: ", i))
  print("coefficients:")
  print(B)
  print("variances:")
  print(diag(J_1)) #diagonal of J^{-1}
  print("std.error:")
  print(sqrt(diag(J_1)))
  writeLines("\n")
}
## [1] "iter: 1"
## [1] "coefficients:"
##            [,1]
## [1,] -0.2641668
## [2,]  0.3871441
## [1] "variances:"
## [1] 0.06797427 0.02913932
## [1] "std.error:"
## [1] 0.2607188 0.1707024
## 
## 
## [1] "iter: 2"
## [1] "coefficients:"
##            [,1]
## [1,] -0.2753545
## [2,]  0.3984872
## [1] "variances:"
## [1] 0.06924687 0.03026490
## [1] "std.error:"
## [1] 0.2631480 0.1739681
## 
## 
## [1] "iter: 3"
## [1] "coefficients:"
##            [,1]
## [1,] -0.2753712
## [2,]  0.3985039
## [1] "variances:"
## [1] 0.06929756 0.03031522
## [1] "std.error:"
## [1] 0.2632443 0.1741127
## 
## 
## [1] "iter: 4"
## [1] "coefficients:"
##            [,1]
## [1,] -0.2753712
## [2,]  0.3985039
## [1] "variances:"
## [1] 0.06929763 0.03031529
## [1] "std.error:"
## [1] 0.2632444 0.1741129
## 
## 
## [1] "iter: 5"
## [1] "coefficients:"
##            [,1]
## [1,] -0.2753712
## [2,]  0.3985039
## [1] "variances:"
## [1] 0.06929763 0.03031529
## [1] "std.error:"
## [1] 0.2632444 0.1741129
## 
## 
## [1] "iter: 6"
## [1] "coefficients:"
##            [,1]
## [1,] -0.2753712
## [2,]  0.3985039
## [1] "variances:"
## [1] 0.06929763 0.03031529
## [1] "std.error:"
## [1] 0.2632444 0.1741129
#pi_hat
fitted.values(logit_model)
##         1         2 
## 0.5307443 0.6275304
#linear predictor (log-odds) values
predict(logit_model)
##         1         2 
## 0.1231327 0.5216365
#pi_hat
exp(predict(logit_model))/(1+exp(predict(logit_model)))
##         1         2 
## 0.5307443 0.6275304
x <- anthers.sum$storage
#Logit model: baseline odds (storage = 1)
exp(coefficients(summary(logit_model))["(Intercept)",1]+coefficients(summary(logit_model))["storage",1]*x[1])
## [1] 1.131034
#Logit model odds ratio: treatment vs. control
exp(coefficients(summary(logit_model))["storage",1]*(x[2]-x[1]))
## [1] 1.489594
questionr::odds.ratio(logit_model, level=0.95)
##                  OR   2.5 % 97.5 %       p  
## (Intercept) 0.75929 0.45311 1.2725 0.29553  
## storage     1.48959 1.06015 2.0989 0.02209 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coefficients(summary(logit_model))
##               Estimate Std. Error   z value   Pr(>|z|)
## (Intercept) -0.2753712  0.2632444 -1.046066 0.29553034
## storage      0.3985039  0.1741129  2.288767 0.02209287
#95% CI of constant
exp(coefficients(summary(logit_model))["(Intercept)",1]+qnorm(c(0.025,0.975))*coefficients(summary(logit_model))["(Intercept)",2])
## [1] 0.4532458 1.2719847
#95% CI of treatment vs. control
exp(coefficients(summary(logit_model))["storage",1]+qnorm(c(0.025,0.975))*coefficients(summary(logit_model))["storage",2])
## [1] 1.058919 2.095430
#Exercises 7.1
hiroshima
## # A tibble: 6 × 4
##   radiation  leukemia `other cancer` `total cancers`
##   <chr>         <dbl>          <dbl>           <dbl>
## 1 0                13            378             391
## 2 1 to 9            5            200             205
## 3 10 to 49          5            151             156
## 4 50 to 99          3             47              50
## 5 100 to 199        4             31              35
## 6 200 +            18             33              51
hiroshima2 <- hiroshima |> mutate(dosage=as.integer(str_split(hiroshima$radiation, " ") |> map_chr(\(x) x[1])))
hiroshima2
## # A tibble: 6 × 5
##   radiation  leukemia `other cancer` `total cancers` dosage
##   <chr>         <dbl>          <dbl>           <dbl>  <int>
## 1 0                13            378             391      0
## 2 1 to 9            5            200             205      1
## 3 10 to 49          5            151             156     10
## 4 50 to 99          3             47              50     50
## 5 100 to 199        4             31              35    100
## 6 200 +            18             33              51    200
model.hiroshima <- glm(cbind(leukemia, `other cancer`) ~ dosage, data=hiroshima2, family=binomial(link='logit'))
summary(model.hiroshima)
## 
## Call:
## glm(formula = cbind(leukemia, `other cancer`) ~ dosage, family = binomial(link = "logit"), 
##     data = hiroshima2)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.488973   0.204062 -17.098  < 2e-16 ***
## dosage       0.014410   0.001817   7.932 2.15e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 54.35089  on 5  degrees of freedom
## Residual deviance:  0.43206  on 4  degrees of freedom
## AIC: 26.097
## 
## Number of Fisher Scoring iterations: 4
n_total <- hiroshima2$`total cancers`
observed <- hiroshima2$leukemia
predict(model.hiroshima)
##          1          2          3          4          5          6 
## -3.4889730 -3.4745628 -3.3448706 -2.7684611 -2.0479491 -0.6069252
fitted.values(model.hiroshima)*n_total
##         1         2         3         4         5         6 
## 11.584398  6.159169  5.313911  2.952623  3.999092 17.990807
exp(predict(model.hiroshima))*n_total/(1+exp(predict(model.hiroshima)))
##         1         2         3         4         5         6 
## 11.584398  6.159169  5.313911  2.952623  3.999092 17.990807
estimated <- fitted.values(model.hiroshima)*n_total
hiroshima2 <- hiroshima2 |> mutate(OL=observed,
                     OO=n_total-observed,
                     EL=estimated,
                     EO=n_total-estimated)
hiroshima2
## # A tibble: 6 × 9
##   radiation  leukemia `other cancer` `total cancers` dosage    OL    OO    EL
##   <chr>         <dbl>          <dbl>           <dbl>  <int> <dbl> <dbl> <dbl>
## 1 0                13            378             391      0    13   378 11.6 
## 2 1 to 9            5            200             205      1     5   200  6.16
## 3 10 to 49          5            151             156     10     5   151  5.31
## 4 50 to 99          3             47              50     50     3    47  2.95
## 5 100 to 199        4             31              35    100     4    31  4.00
## 6 200 +            18             33              51    200    18    33 18.0 
## # ℹ 1 more variable: EO <dbl>
hiroshima2[hiroshima2$dosage == 0,]
## # A tibble: 1 × 9
##   radiation leukemia `other cancer` `total cancers` dosage    OL    OO    EL
##   <chr>        <dbl>          <dbl>           <dbl>  <int> <dbl> <dbl> <dbl>
## 1 0               13            378             391      0    13   378  11.6
## # ℹ 1 more variable: EO <dbl>
hiroshima2[hiroshima2$dosage > 0 & hiroshima2$dosage <= 50,]
## # A tibble: 3 × 9
##   radiation leukemia `other cancer` `total cancers` dosage    OL    OO    EL
##   <chr>        <dbl>          <dbl>           <dbl>  <int> <dbl> <dbl> <dbl>
## 1 1 to 9           5            200             205      1     5   200  6.16
## 2 10 to 49         5            151             156     10     5   151  5.31
## 3 50 to 99         3             47              50     50     3    47  2.95
## # ℹ 1 more variable: EO <dbl>
hiroshima2[hiroshima2$dosage > 50,]
## # A tibble: 2 × 9
##   radiation  leukemia `other cancer` `total cancers` dosage    OL    OO    EL
##   <chr>         <dbl>          <dbl>           <dbl>  <int> <dbl> <dbl> <dbl>
## 1 100 to 199        4             31              35    100     4    31  4.00
## 2 200 +            18             33              51    200    18    33 18.0 
## # ℹ 1 more variable: EO <dbl>
hiroshima2[hiroshima2$dosage == 0,] |> summarize(across(6:9, sum)) |> as.data.frame()
##   OL  OO      EL       EO
## 1 13 378 11.5844 379.4156
hiroshima2[hiroshima2$dosage > 0 & hiroshima2$dosage <= 50,] |> summarize(across(6:9, sum)) |> as.data.frame()
##   OL  OO      EL       EO
## 1 13 398 14.4257 396.5743
hiroshima2[hiroshima2$dosage > 50,] |> summarize(across(6:9, sum)) |> as.data.frame()
##   OL OO      EL      EO
## 1 22 64 21.9899 64.0101
#X_HL^2
X_HL <- sum(c((13-11.5844)^2/11.5844,
                (378-379.4156)^2/379.4156,
                (13-14.4257)^2/14.4257,
                (398-396.5743)^2/396.5743,
                (22-21.9899)^2/21.9899,
                (64-64.0101)^2/64.0101
                ))
X_HL
## [1] 0.3243007
1-pchisq(X_HL, df=1)
## [1] 0.5690345
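
As with the senility data, the statistic can be computed from hiroshima2 rather than by hand. A sketch, assuming the same three dose groups:

#X_HL^2 from the grouped observed and expected counts
grp <- cut(hiroshima2$dosage, breaks = c(-1, 0, 50, 200))
OL <- tapply(hiroshima2$OL, grp, sum); EL <- tapply(hiroshima2$EL, grp, sum)
OO <- tapply(hiroshima2$OO, grp, sum); EO <- tapply(hiroshima2$EO, grp, sum)
sum((OL - EL)^2/EL + (OO - EO)^2/EO) #about 0.32
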
#Exercises 7.3
graduates
## # A tibble: 60 × 5
##     year survive total faculty  sex  
##    <dbl> <chr>   <chr> <chr>    <chr>
##  1  1938 18      22    medicine men  
##  2  1939 16      23    medicine men  
##  3  1940 7       17    medicine men  
##  4  1941 12      25    medicine men  
##  5  1942 24      50    medicine men  
##  6  1943 16      21    medicine men  
##  7  1944 22      32    medicine men  
##  8  1945 12      14    medicine men  
##  9  1946 22      34    medicine men  
## 10  1947 28      37    medicine men  
## 11  1938 16      30    arts     men  
## 12  1939 13      22    arts     men  
## 13  1940 11      25    arts     men  
## 14  1941 12      14    arts     men  
## 15  1942 8       12    arts     men  
## 16  1943 11      20    arts     men  
## 17  1944 4       10    arts     men  
## 18  1945 4       12    arts     men  
## 19  1946 *       *     arts     men  
## 20  1947 13      23    arts     men  
## 21  1938 9       14    science  men  
## 22  1939 9       12    science  men  
## 23  1940 12      19    science  men  
## 24  1941 12      15    science  men  
## 25  1942 20      28    science  men  
## # ℹ 35 more rows
graduates$survive = as.numeric(graduates$survive)
graduates$total = as.numeric(graduates$total)
graduates
## # A tibble: 60 × 5
##     year survive total faculty  sex  
##    <dbl>   <dbl> <dbl> <chr>    <chr>
##  1  1938      18    22 medicine men  
##  2  1939      16    23 medicine men  
##  3  1940       7    17 medicine men  
##  4  1941      12    25 medicine men  
##  5  1942      24    50 medicine men  
##  6  1943      16    21 medicine men  
##  7  1944      22    32 medicine men  
##  8  1945      12    14 medicine men  
##  9  1946      22    34 medicine men  
## 10  1947      28    37 medicine men  
## 11  1938      16    30 arts     men  
## 12  1939      13    22 arts     men  
## 13  1940      11    25 arts     men  
## 14  1941      12    14 arts     men  
## 15  1942       8    12 arts     men  
## 16  1943      11    20 arts     men  
## 17  1944       4    10 arts     men  
## 18  1945       4    12 arts     men  
## 19  1946      NA    NA arts     men  
## 20  1947      13    23 arts     men  
## 21  1938       9    14 science  men  
## 22  1939       9    12 science  men  
## 23  1940      12    19 science  men  
## 24  1941      12    15 science  men  
## 25  1942      20    28 science  men  
## # ℹ 35 more rows
aggregate(graduates[c("survive","total")], by = graduates[c("sex")], FUN=sum, na.rm = TRUE) |> mutate(ratio=survive/total)
##     sex survive total     ratio
## 1   men     536   799 0.6708385
## 2 women     184   218 0.8440367
graduates$year <- factor(graduates$year)
model.graduates <- glm(cbind(survive, total-survive) ~ year, data=graduates, family=binomial(link='logit'),
                       na.action = na.omit)
summary(model.graduates)
## 
## Call:
## glm(formula = cbind(survive, total - survive) ~ year, family = binomial(link = "logit"), 
##     data = graduates, na.action = na.omit)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.69315    0.21004   3.300 0.000967 ***
## year1939     0.06899    0.31064   0.222 0.824238    
## year1940    -0.18760    0.29378  -0.639 0.523103    
## year1941     0.21511    0.31662   0.679 0.496882    
## year1942    -0.17261    0.28810  -0.599 0.549076    
## year1943     0.43672    0.32729   1.334 0.182096    
## year1944     0.34093    0.29407   1.159 0.246323    
## year1945     0.62614    0.30175   2.075 0.037988 *  
## year1946     0.07411    0.39592   0.187 0.851521    
## year1947     0.40547    0.28143   1.441 0.149667    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 116.09  on 57  degrees of freedom
## Residual deviance: 101.04  on 48  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 269.78
## 
## Number of Fisher Scoring iterations: 4
graduates$sex <- factor(graduates$sex)
graduates$faculty <- factor(graduates$faculty)
model.graduates2 <- glm(cbind(survive, total-survive) ~ faculty, 
                       data=graduates, family=binomial(link='logit'),
                       na.action = na.omit)
summary(model.graduates2)
## 
## Call:
## glm(formula = cbind(survive, total - survive) ~ faculty, family = binomial(link = "logit"), 
##     data = graduates, na.action = na.omit)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          0.7117     0.1180   6.029 1.65e-09 ***
## facultyengineering   0.2595     0.2220   1.169  0.24243    
## facultymedicine     -0.1205     0.1726  -0.698  0.48511    
## facultyscience       0.7207     0.1931   3.731  0.00019 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 116.086  on 57  degrees of freedom
## Residual deviance:  94.225  on 54  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 250.96
## 
## Number of Fisher Scoring iterations: 4
model.graduates.sex <- glm(cbind(survive, total-survive) ~ sex, 
                       data=graduates, family=binomial(link='logit'),
                       na.action = na.omit)
summary(model.graduates.sex)
## 
## Call:
## glm(formula = cbind(survive, total - survive) ~ sex, family = binomial(link = "logit"), 
##     data = graduates, na.action = na.omit)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.71198    0.07529   9.457  < 2e-16 ***
## sexwomen     0.97660    0.20128   4.852 1.22e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 116.09  on 57  degrees of freedom
## Residual deviance:  88.84  on 56  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 241.58
## 
## Number of Fisher Scoring iterations: 4
model.graduates.interact <- glm(cbind(survive, total-survive) ~ faculty*sex, 
                       data=graduates, family=binomial(link='logit'),
                       na.action = na.omit)
summary(model.graduates.interact)
## 
## Call:
## glm(formula = cbind(survive, total - survive) ~ faculty * sex, 
##     family = binomial(link = "logit"), data = graduates, na.action = na.omit)
## 
## Coefficients: (2 not defined because of singularities)
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   0.1911     0.1550   1.233  0.21774    
## facultyengineering            0.7801     0.2437   3.201  0.00137 ** 
## facultymedicine               0.4001     0.1997   2.004  0.04511 *  
## facultyscience                0.9968     0.2239   4.452 8.50e-06 ***
## sexwomen                      1.2112     0.2534   4.780 1.76e-06 ***
## facultyengineering:sexwomen       NA         NA      NA       NA    
## facultymedicine:sexwomen          NA         NA      NA       NA    
## facultyscience:sexwomen       0.5627     0.6640   0.848  0.39671    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 116.086  on 57  degrees of freedom
## Residual deviance:  56.765  on 52  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 217.5
## 
## Number of Fisher Scoring iterations: 5
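
The two undefined interaction coefficients arise because women appear only in the arts and science faculties, so the engineering and medicine cells for women are empty:

table(graduates$faculty, graduates$sex)
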
women <- graduates[graduates$sex == "women",]
men <- graduates[graduates$sex == "men",]
model.graduates.women <- glm(cbind(survive, total-survive) ~ faculty, 
                       data=women, family=binomial(link='logit'),
                       na.action = na.omit)
summary(model.graduates.women)
## 
## Call:
## glm(formula = cbind(survive, total - survive) ~ faculty, family = binomial(link = "logit"), 
##     data = women, na.action = na.omit)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      1.4023     0.2005   6.994 2.66e-12 ***
## facultyscience   1.5595     0.6251   2.495   0.0126 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22.555  on 19  degrees of freedom
## Residual deviance: 13.739  on 18  degrees of freedom
## AIC: 46.642
## 
## Number of Fisher Scoring iterations: 5
model.graduates.men <- glm(cbind(survive, total-survive) ~ faculty, 
                       data=men, family=binomial(link='logit'),
                       na.action = na.omit)
summary(model.graduates.men)
## 
## Call:
## glm(formula = cbind(survive, total - survive) ~ faculty, family = binomial(link = "logit"), 
##     data = men, na.action = na.omit)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          0.1911     0.1550   1.233  0.21774    
## facultyengineering   0.7801     0.2437   3.201  0.00137 ** 
## facultymedicine      0.4001     0.1997   2.004  0.04511 *  
## facultyscience       0.9968     0.2239   4.452  8.5e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 66.285  on 37  degrees of freedom
## Residual deviance: 43.026  on 34  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 170.86
## 
## Number of Fisher Scoring iterations: 4

Chapter8 Nominal and Ordinal Logistic Regression

8.2 Multinomial distribution

If \((X_1,\cdots, X_n)\) is a vector with a multinomial distribution, prove that \(\text{Cov}(X_i,X_j)=-rp_ip_j\) for \(i\neq j\), where \(r\) is the number of trials of the experiment and \(p_i\) is the probability of success for the variable \(X_i\). The probability mass function is \[f(x_1,\dots,x_n)={r!\over{x_1!x_2!\cdots x_n!}}p_1^{x_1}\cdots p_n^{x_n}\] where \(x_1+x_2+\cdots +x_n=r\).

We can use indicator random variables to help simplify the covariance expression. We can interpret the problem as \(r\) independent rolls of an \(n\) sided die. Let \(X_i\) be the number of rolls that result in side \(i\) facing up, and let \(I_{k}^{(i)}\) be an indicator equal to \(1\) when roll \(k\) is equal to \(i\) and \(0\) otherwise. Then, we can express \(X_i\) and \(X_j\) as follows:

\[\begin{equation} X_i = \sum_{k=1}^{r} I_{k}^{(i)}~~~\mathrm{and}~~~X_j = \sum_{k=1}^{r} I_{k}^{(j)} \end{equation}\]

Let’s re-write the covariance using indicators: \[\begin{equation} \mathrm{Cov}(X_i,X_j) = E[X_i X_j] - E[X_i]E[X_j] \end{equation}\] Let’s compute the first term: \[\begin{eqnarray} E[X_i X_j] &=& E\bigg[\Big(\sum_{k=1}^{r}I_{k}^{(i)}\Big) \Big(\sum_{l=1}^{r}I_{l}^{(j)}\Big)\bigg] = \sum_{k=l}E\big[I_{k}^{(i)}I_{l}^{(j)}\big] + \sum_{k\neq l}E\big[I_{k}^{(i)}I_{l}^{(j)}\big] \\ &=& 0 + \sum_{k\neq l}E\big[I_{k}^{(i)}\big] E\big[I_{l}^{(j)}\big] = \sum_{k\neq l} p_i p_j = (r^2 - r)p_i p_j \end{eqnarray}\] where we expanded the product of sums and used linearity of expectation: when \(k=l\) the same roll cannot show both \(i\) and \(j\), so the product of indicators is zero, and when \(k\neq l\) the rolls are independent, which lets us write the expectation as a product of probabilities. Let’s compute the remaining term: \[\begin{equation} E[X_i] = E\Big[\sum_{k=1}^{r}I_{k}^{(i)}\Big] = \sum_{k=1}^{r}E\big[I_{k}^{(i)}\big] = rp_i \end{equation}\] Therefore, the covariance equals \[\begin{equation} \mathrm{Cov}(X_i,X_j) = E[X_i X_j] - E[X_i]E[X_j] = (r^2-r)p_ip_j - r^2p_ip_j = -r p_i p_j \end{equation}\] Notice that \(\mathrm{Cov}(X_i, X_j) = -r p_i p_j < 0\). This makes sense intuitively: for a fixed number of rolls \(r\), rolling many outcomes \(i\) leaves fewer rolls available for outcome \(j\), so \(X_i\) and \(X_j\) are negatively correlated.
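
A quick Monte Carlo check of this result (a sketch; the choices of \(r\), \(p\) and the number of draws are arbitrary):

set.seed(1)
r <- 10
p <- c(0.2, 0.3, 0.5)
draws <- rmultinom(100000, size = r, prob = p) #3 x 100000 matrix of counts
cov(draws[1,], draws[2,]) #empirical covariance, close to the theoretical value
-r*p[1]*p[2] #theoretical value: -0.6
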

Cars
## # A tibble: 18 × 4
##    sex   age   response       frequency
##    <chr> <chr> <chr>              <dbl>
##  1 women 18-23 no/little             26
##  2 women 18-23 important             12
##  3 women 18-23 very important         7
##  4 women 24-40 no/little              9
##  5 women 24-40 important             21
##  6 women 24-40 very important        15
##  7 women > 40  no/little              5
##  8 women > 40  important             14
##  9 women > 40  very important        41
## 10 men   18-23 no/little             40
## 11 men   18-23 important             17
## 12 men   18-23 very important         8
## 13 men   24-40 no/little             17
## 14 men   24-40 important             15
## 15 men   24-40 very important        12
## 16 men   > 40  no/little              8
## 17 men   > 40  important             15
## 18 men   > 40  very important        18
#Figure 8.1
library(tidyverse)
library(ggprism)
Cars |> group_by(sex, age) |> mutate(
    age = factor(age, levels = c("18-23", "24-40", "> 40")),
    proportion = frequency/sum(frequency)
  ) |> ggplot(aes(x = age, y = proportion, group = response)) +
  geom_point(size=2) +
  geom_line(aes(color = response, linetype = response), size=1) +
  facet_wrap(~sex) + theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
) + scale_y_continuous(limits = c(0, 0.8), expand = c(0, 0)) +
  theme_prism(palette = "black_and_white", base_size = 16) +
  scale_colour_prism(palette = "colorblind_safe")

factor(Cars$age)
##  [1] 18-23 18-23 18-23 24-40 24-40 24-40 > 40  > 40  > 40  18-23 18-23 18-23
## [13] 24-40 24-40 24-40 > 40  > 40  > 40 
## Levels: > 40 18-23 24-40
factor(Cars$sex)
##  [1] women women women women women women women women women men   men   men  
## [13] men   men   men   men   men   men  
## Levels: men women
factor(Cars$response)
##  [1] no/little      important      very important no/little      important     
##  [6] very important no/little      important      very important no/little     
## [11] important      very important no/little      important      very important
## [16] no/little      important      very important
## Levels: important no/little very important
library(nnet)
Cars2 <- Cars |> group_by(sex, age) |> mutate(
    age = factor(age, levels = c("18-23", "24-40", "> 40")),
    sex = factor(sex, levels = c("women", "men")),
    response = factor(response, levels = c("no/little", "important", "very important"))
)

res.cars=multinom(response~age+sex, weights=frequency,data=Cars2)
## # weights:  15 (8 variable)
## initial  value 329.583687 
## iter  10 value 290.490920
## final  value 290.351098 
## converged
summary(res.cars)
## Call:
## multinom(formula = response ~ age + sex, data = Cars2, weights = frequency)
## 
## Coefficients:
##                (Intercept) age24-40  age> 40     sexmen
## important       -0.5907992 1.128268 1.587709 -0.3881301
## very important  -1.0390726 1.478104 2.916757 -0.8130202
## 
## Std. Errors:
##                (Intercept)  age24-40   age> 40    sexmen
## important        0.2839756 0.3416449 0.4028997 0.3005115
## very important   0.3305014 0.4009256 0.4229276 0.3210382
## 
## Residual Deviance: 580.7022 
## AIC: 596.7022
summary(res.cars)$coefficients
##                (Intercept) age24-40  age> 40     sexmen
## important       -0.5907992 1.128268 1.587709 -0.3881301
## very important  -1.0390726 1.478104 2.916757 -0.8130202
summary(res.cars)$standard.errors
##                (Intercept)  age24-40   age> 40    sexmen
## important        0.2839756 0.3416449 0.4028997 0.3005115
## very important   0.3305014 0.4009256 0.4229276 0.3210382
#Odds ratio
exp(summary(res.cars)$coefficients)
##                (Intercept) age24-40   age> 40    sexmen
## important        0.5538844 3.090300  4.892529 0.6783241
## very important   0.3537826 4.384626 18.481251 0.4435165
#95% CI
exp(summary(res.cars)$coefficients+qnorm(0.025)*summary(res.cars)$standard.errors)
##                (Intercept) age24-40  age> 40    sexmen
## important        0.3174671 1.581947 2.221184 0.3763929
## very important   0.1851029 1.998316 8.067421 0.2363968
exp(summary(res.cars)$coefficients+qnorm(0.975)*summary(res.cars)$standard.errors)
##                (Intercept) age24-40  age> 40    sexmen
## important        0.9663615 6.036836 10.77661 1.2224555
## very important   0.6761761 9.620575 42.33777 0.8321049
#(95% confidence interval)
exp(summary(res.cars)$coefficients - summary(res.cars)$standard.errors * 1.96)
##                (Intercept) age24-40  age> 40    sexmen
## important        0.3174638 1.581928 2.221152 0.3763888
## very important   0.1851007 1.998287 8.067298 0.2363941
exp(summary(res.cars)$coefficients + summary(res.cars)$standard.errors * 1.96)
##                (Intercept) age24-40  age> 40    sexmen
## important        0.9663714 6.036910 10.77677 1.2224687
## very important   0.6761841 9.620714 42.33842 0.8321145
logLik(res.cars)
## 'log Lik.' -290.3511 (df=8)
#minimal model
res.cars.mini=multinom(response~1, weights=frequency,data=Cars2)
## # weights:  6 (2 variable)
## initial  value 329.583687 
## final  value 329.272024 
## converged
summary(res.cars.mini)
## Call:
## multinom(formula = response ~ 1, data = Cars2, weights = frequency)
## 
## Coefficients:
##                (Intercept)
## important      -0.11066559
## very important -0.03883986
## 
## Std. Errors:
##                (Intercept)
## important        0.1419933
## very important   0.1393729
## 
## Residual Deviance: 658.544 
## AIC: 662.544
logLik(res.cars.mini)
## 'log Lik.' -329.272 (df=2)
# the likelihood ratio chi-squared statistic C
2*(c(logLik(res.cars) - logLik(res.cars.mini)))
## [1] 77.84185
#degrees of freedom
8-2
## [1] 6
#p value with chisq(6)
1-pchisq(2*(c(logLik(res.cars) - logLik(res.cars.mini))), df=6)
## [1] 9.992007e-15
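
The same likelihood ratio test can be obtained directly from the anova method that nnet supplies for multinom fits; a one-line check:

anova(res.cars.mini, res.cars)
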
#pseudo R^2

c((logLik(res.cars.mini) - logLik(res.cars))/logLik(res.cars.mini))
## [1] 0.118203
#AIC
res.cars$AIC
## [1] 596.7022
-2* logLik(res.cars)+2*res.cars$edf
## 'log Lik.' 596.7022 (df=8)
attr(logLik(res.cars), "df")
## [1] 8
#coefficients
coef(res.cars)
##                (Intercept) age24-40  age> 40     sexmen
## important       -0.5907992 1.128268 1.587709 -0.3881301
## very important  -1.0390726 1.478104 2.916757 -0.8130202
#fitted values
fitted(res.cars)
##     no/little important very important
## 1  0.52420049 0.2903465      0.1854530
## 2  0.52420049 0.2903465      0.1854530
## 3  0.52420049 0.2903465      0.1854530
## 4  0.23458355 0.4015294      0.3638870
## 5  0.23458355 0.4015294      0.3638870
## 6  0.23458355 0.4015294      0.3638870
## 7  0.09757772 0.2644254      0.6379968
## 8  0.09757772 0.2644254      0.6379968
## 9  0.09757772 0.2644254      0.6379968
## 10 0.65247678 0.2451441      0.1023791
## 11 0.65247678 0.2451441      0.1023791
## 12 0.65247678 0.2451441      0.1023791
## 13 0.35099404 0.4075274      0.2414786
## 14 0.35099404 0.4075274      0.2414786
## 15 0.35099404 0.4075274      0.2414786
## 16 0.17427519 0.3203504      0.5053744
## 17 0.17427519 0.3203504      0.5053744
## 18 0.17427519 0.3203504      0.5053744
#Table 8.3
fitted(res.cars) |> bind_cols(Obs.freq = Cars2$frequency, 
                              sex = Cars2$sex,
                              age = Cars2$age) |> 
  group_by(sex, age) |> 
  mutate(group.total = sum(Obs.freq)) |> 
  mutate(fitted.no.little = `no/little`*group.total,
         fitted.important = important*group.total,
         fitted.very.important = `very important`*group.total,
         residual.no.little = (Obs.freq-fitted.no.little)/sqrt(fitted.no.little),
         residual.important = (Obs.freq-fitted.important)/sqrt(fitted.important),
         residual.very.important = (Obs.freq-fitted.very.important)/sqrt(fitted.very.important)
         ) |> as.data.frame()
##     no/little important very important Obs.freq   sex   age group.total
## 1  0.52420049 0.2903465      0.1854530       26 women 18-23          45
## 2  0.52420049 0.2903465      0.1854530       12 women 18-23          45
## 3  0.52420049 0.2903465      0.1854530        7 women 18-23          45
## 4  0.23458355 0.4015294      0.3638870        9 women 24-40          45
## 5  0.23458355 0.4015294      0.3638870       21 women 24-40          45
## 6  0.23458355 0.4015294      0.3638870       15 women 24-40          45
## 7  0.09757772 0.2644254      0.6379968        5 women  > 40          60
## 8  0.09757772 0.2644254      0.6379968       14 women  > 40          60
## 9  0.09757772 0.2644254      0.6379968       41 women  > 40          60
## 10 0.65247678 0.2451441      0.1023791       40   men 18-23          65
## 11 0.65247678 0.2451441      0.1023791       17   men 18-23          65
## 12 0.65247678 0.2451441      0.1023791        8   men 18-23          65
## 13 0.35099404 0.4075274      0.2414786       17   men 24-40          44
## 14 0.35099404 0.4075274      0.2414786       15   men 24-40          44
## 15 0.35099404 0.4075274      0.2414786       12   men 24-40          44
## 16 0.17427519 0.3203504      0.5053744        8   men  > 40          41
## 17 0.17427519 0.3203504      0.5053744       15   men  > 40          41
## 18 0.17427519 0.3203504      0.5053744       18   men  > 40          41
##    fitted.no.little fitted.important fitted.very.important residual.no.little
## 1         23.589022         13.06559              8.345386          0.4964074
## 2         23.589022         13.06559              8.345386         -2.3861175
## 3         23.589022         13.06559              8.345386         -3.4155907
## 4         10.556260         18.06882             16.374916         -0.4789906
## 5         10.556260         18.06882             16.374916          3.2144083
## 6         10.556260         18.06882             16.374916          1.3677088
## 7          5.854663         15.86553             38.279811         -0.3532191
## 8          5.854663         15.86553             38.279811          3.3663407
## 9          5.854663         15.86553             38.279811         14.5250200
## 10        42.410991         15.93437              6.654642         -0.3702170
## 11        42.410991         15.93437              6.654642         -3.9019562
## 12        42.410991         15.93437              6.654642         -5.2839411
## 13        15.443738         17.93121             10.625057          0.3960104
## 14        15.443738         17.93121             10.625057         -0.1129146
## 15        15.443738         17.93121             10.625057         -0.8763021
## 16         7.145283         13.13437             20.720349          0.3197517
## 17         7.145283         13.13437             20.720349          2.9384673
## 18         7.145283         13.13437             20.720349          4.0607740
##    residual.important residual.very.important
## 1           3.5783433               6.1113199
## 2          -0.2947993               1.2650810
## 3          -1.6780645              -0.4657187
## 4          -2.1334675              -1.8225001
## 5           0.6895678               1.1429574
## 6          -0.7219499              -0.3397713
## 7          -2.7278690              -5.3789308
## 8          -0.4683537              -3.9242838
## 9           6.3101920               0.4396572
## 10          6.0287861              12.9262641
## 11          0.2669563               4.0103582
## 12         -1.9876727               0.5215254
## 13         -0.2199079               1.9557381
## 14         -0.6922158               1.3421677
## 15         -1.4006778               0.4218122
## 16         -1.4167148              -2.7944767
## 17          0.5147796              -1.2566780
## 18          1.3425629              -0.5976214
#Table 8.3
options(paged.print = FALSE,
        pillar.print_max = 25, 
        pillar.print_min = 25)
fitted(res.cars) |> as_tibble() |> distinct(.keep_all = TRUE)|> pivot_longer(
    cols = c('no/little', 'important', 'very important'), 
    names_to = "Importance.Rating", 
    values_to = "Estimated.probability"
  ) |> bind_cols(Obs.freq = Cars2$frequency, 
                              sex = Cars2$sex,
                              age = Cars2$age) |> 
  group_by(sex, age) |> 
  mutate(group.total = sum(Obs.freq)) |> 
  mutate(fitted.value = Estimated.probability*group.total,
         Pearson.residual = (Obs.freq-fitted.value)/sqrt(fitted.value)
         )
## # A tibble: 18 × 8
## # Groups:   sex, age [6]
##    Importance.Rating Estimated.probability Obs.freq sex   age   group.total
##    <chr>                             <dbl>    <dbl> <fct> <fct>       <dbl>
##  1 no/little                        0.524        26 women 18-23          45
##  2 important                        0.290        12 women 18-23          45
##  3 very important                   0.185         7 women 18-23          45
##  4 no/little                        0.235         9 women 24-40          45
##  5 important                        0.402        21 women 24-40          45
##  6 very important                   0.364        15 women 24-40          45
##  7 no/little                        0.0976        5 women > 40           60
##  8 important                        0.264        14 women > 40           60
##  9 very important                   0.638        41 women > 40           60
## 10 no/little                        0.652        40 men   18-23          65
## 11 important                        0.245        17 men   18-23          65
## 12 very important                   0.102         8 men   18-23          65
## 13 no/little                        0.351        17 men   24-40          44
## 14 important                        0.408        15 men   24-40          44
## 15 very important                   0.241        12 men   24-40          44
## 16 no/little                        0.174         8 men   > 40           41
## 17 important                        0.320        15 men   > 40           41
## 18 very important                   0.505        18 men   > 40           41
## # ℹ 2 more variables: fitted.value <dbl>, Pearson.residual <dbl>
sum.squared.residuals <- fitted(res.cars) |> as_tibble() |> distinct(.keep_all = TRUE)|> pivot_longer(
    cols = c('no/little', 'important', 'very important'), 
    names_to = "Importance.Rating", 
    values_to = "Estimated.probability"
  ) |> bind_cols(Obs.freq = Cars2$frequency, 
                              sex = Cars2$sex,
                              age = Cars2$age) |> 
  group_by(sex, age) |> 
  mutate(group.total = sum(Obs.freq)) |> 
  mutate(fitted.value = Estimated.probability*group.total,
         Pearson.residual = (Obs.freq-fitted.value)/sqrt(fitted.value)
         ) |> ungroup() |> 
  summarize(sum(Pearson.residual^2))
sum.squared.residuals
## # A tibble: 1 × 1
##   `sum(Pearson.residual^2)`
##                       <dbl>
## 1                      3.93
res.cars.max=multinom(response~age+sex+age*sex, weights=frequency,data=Cars2)
## # weights:  21 (12 variable)
## initial  value 329.583687 
## iter  10 value 289.028791
## final  value 288.381742 
## converged
res.cars.max
## Call:
## multinom(formula = response ~ age + sex + age * sex, data = Cars2, 
##     weights = frequency)
## 
## Coefficients:
##                (Intercept) age24-40  age> 40      sexmen age24-40:sexmen
## important       -0.7731881 1.620480 1.802883 -0.08249128      -0.8899378
## very important  -1.3121701 1.822993 3.416376 -0.29728385      -0.5618162
##                age> 40:sexmen
## important          -0.3185948
## very important     -0.9959994
## 
## Residual Deviance: 576.7635 
## AIC: 600.7635
summary(res.cars.max)
## Call:
## multinom(formula = response ~ age + sex + age * sex, data = Cars2, 
##     weights = frequency)
## 
## Coefficients:
##                (Intercept) age24-40  age> 40      sexmen age24-40:sexmen
## important       -0.7731881 1.620480 1.802883 -0.08249128      -0.8899378
## very important  -1.3121701 1.822993 3.416376 -0.29728385      -0.5618162
##                age> 40:sexmen
## important          -0.3185948
## very important     -0.9959994
## 
## Std. Errors:
##                (Intercept)  age24-40   age> 40    sexmen age24-40:sexmen
## important        0.3489915 0.5296461 0.6270856 0.4534522       0.6998853
## very important   0.4258132 0.5992447 0.6369618 0.5756024       0.8070020
##                age> 40:sexmen
## important           0.8177568
## very important      0.8580658
## 
## Residual Deviance: 576.7635 
## AIC: 600.7635
logLik(res.cars.max)
## 'log Lik.' -288.3817 (df=12)
attr(logLik(res.cars.max), "df")
## [1] 12
#the deviance
2*(logLik(res.cars.max) - logLik(res.cars))
## 'log Lik.' 3.938713 (df=12)
#The degrees of freedom associated with this deviance
attr(logLik(res.cars.max), "df") - attr(logLik(res.cars), "df")
## [1] 4
difference <- 2*(logLik(res.cars.max) - logLik(res.cars))
difference
## 'log Lik.' 3.938713 (df=12)
1-pchisq(difference,df=4)
## 'log Lik.' 0.4143637 (df=12)
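The same likelihood-ratio comparison can be obtained in one step; a minimal sketch, assuming the anova method that nnet provides for nested multinom fits:

anova(res.cars, res.cars.max) #likelihood-ratio test of the age-by-sex interaction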
library(MASS)
res.polr=polr(response~age+sex, weights=frequency,data=Cars2)
res.polr
## Call:
## polr(formula = response ~ age + sex, data = Cars2, weights = frequency)
## 
## Coefficients:
##   age24-40    age> 40     sexmen 
##  1.1470997  2.2324597 -0.5762262 
## 
## Intercepts:
##      no/little|important important|very important 
##               0.04353714               1.65497448 
## 
## Residual Deviance: 581.2956 
## AIC: 591.2956
logLik(res.polr)
## 'log Lik.' -290.6478 (df=5)
summary(res.polr)
## Call:
## polr(formula = response ~ age + sex, data = Cars2, weights = frequency)
## 
## Coefficients:
##            Value Std. Error t value
## age24-40  1.1471     0.2776   4.132
## age> 40   2.2325     0.2915   7.659
## sexmen   -0.5762     0.2262  -2.548
## 
## Intercepts:
##                          Value   Std. Error t value
## no/little|important       0.0435  0.2323     0.1874
## important|very important  1.6550  0.2556     6.4744
## 
## Residual Deviance: 581.2956 
## AIC: 591.2956
coefficients <- summary(res.polr)$coefficients
coefficients
##                                Value Std. Error   t value
## age24-40                  1.14709969  0.2776161  4.131964
## age> 40                   2.23245971  0.2914908  7.658766
## sexmen                   -0.57622616  0.2261865 -2.547571
## no/little|important       0.04353714  0.2322724  0.187440
## important|very important  1.65497448  0.2556182  6.474401

The following proportional odds model was fitted to these data:

\[\log\left(\frac{\pi_1}{\pi_2+\pi_3}\right)=\beta_{01}+\beta_{1}x_1+\beta_{2}x_2+\beta_{3}x_3\] \[\log\left(\frac{\pi_1+\pi_2}{\pi_3}\right)=\beta_{02}+\beta_{1}x_1+\beta_{2}x_2+\beta_{3}x_3\]

#Table8.4
tibble(Estimate_b= c(coefficients[4,1], #beta_01
                      coefficients[5,1], #beta_02
                      coefficients[3,1], #beta_1
                      coefficients[1,1], #beta_2
                      coefficients[2,1]),#beta_3
       Standard_error_b= c(coefficients[4,2],
                      coefficients[5,2],
                      coefficients[3,2],
                      coefficients[1,2],
                      coefficients[2,2]),
       Odds_ratio_OR=exp(Estimate_b),
       `95%_CI_low`=c(exp(Estimate_b+qnorm(0.025)*Standard_error_b)),
       `95%_CI_high`=c(exp(Estimate_b+qnorm(0.975)*Standard_error_b))
)
## # A tibble: 5 × 5
##   Estimate_b Standard_error_b Odds_ratio_OR `95%_CI_low` `95%_CI_high`
##        <dbl>            <dbl>         <dbl>        <dbl>         <dbl>
## 1     0.0435            0.232         1.04         0.663         1.65 
## 2     1.65              0.256         5.23         3.17          8.64 
## 3    -0.576             0.226         0.562        0.361         0.876
## 4     1.15              0.278         3.15         1.83          5.43 
## 5     2.23              0.291         9.32         5.27         16.5
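The intervals above are Wald intervals built from the standard errors. As a check, MASS also supplies a profile-likelihood confint() method for polr objects; it covers the three regression coefficients (not the intercepts) and will typically give slightly different limits. A minimal sketch:

exp(confint(res.polr)) #profile-likelihood CIs for the odds ratios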
res.polr.mini=polr(response~1, weights=frequency,data=Cars2)
summary(res.polr.mini)
## Call:
## polr(formula = response ~ 1, data = Cars2, weights = frequency)
## 
## No coefficients
## 
## Intercepts:
##                          Value   Std. Error t value
## no/little|important      -0.6190  0.1210    -5.1141
## important|very important  0.6782  0.1222     5.5510
## 
## Residual Deviance: 658.544 
## AIC: 662.544
logLik(res.polr)
## 'log Lik.' -290.6478 (df=5)
logLik(res.polr.mini)
## 'log Lik.' -329.272 (df=2)
#C, the likelihood-ratio chi-squared statistic
2*(logLik(res.polr)-logLik(res.polr.mini))
## 'log Lik.' 77.24849 (df=5)
#pseudo R^2
(logLik(res.polr.mini)-logLik(res.polr))/logLik(res.polr.mini)
## 'log Lik.' 0.1173019 (df=2)
#AIC
-2*logLik(res.polr)+2*attr(logLik(res.polr), "df")
## 'log Lik.' 591.2956 (df=5)
res.polr$deviance
## [1] 581.2956
fitted(res.polr)
##    no/little important very important
## 1  0.5108826 0.3286797     0.16043777
## 2  0.5108826 0.3286797     0.16043777
## 3  0.5108826 0.3286797     0.16043777
## 4  0.2490730 0.3752352     0.37569186
## 5  0.2490730 0.3752352     0.37569186
## 6  0.2490730 0.3752352     0.37569186
## 7  0.1007497 0.2587618     0.64048855
## 8  0.1007497 0.2587618     0.64048855
## 9  0.1007497 0.2587618     0.64048855
## 10 0.6501647 0.2528518     0.09698344
## 11 0.6501647 0.2528518     0.09698344
## 12 0.6501647 0.2528518     0.09698344
## 13 0.3711383 0.3761309     0.25273073
## 14 0.3711383 0.3761309     0.25273073
## 15 0.3711383 0.3761309     0.25273073
## 16 0.1662146 0.3334706     0.50031477
## 17 0.1662146 0.3334706     0.50031477
## 18 0.1662146 0.3334706     0.50031477
fitted(res.polr) |> as_tibble() |> distinct(.keep_all = TRUE)|> pivot_longer(
    cols = c('no/little', 'important', 'very important'), 
    names_to = "Importance.Rating", 
    values_to = "Estimated.probability"
  ) |> bind_cols(Obs.freq = Cars2$frequency, 
                              sex = Cars2$sex,
                              age = Cars2$age) |> 
  group_by(sex, age) |> 
  mutate(group.total = sum(Obs.freq)) |> 
  mutate(fitted.value = Estimated.probability*group.total,
         Pearson.residual = (Obs.freq-fitted.value)/sqrt(fitted.value)
         )
## # A tibble: 18 × 8
## # Groups:   sex, age [6]
##    Importance.Rating Estimated.probability Obs.freq sex   age   group.total
##    <chr>                             <dbl>    <dbl> <fct> <fct>       <dbl>
##  1 no/little                        0.511        26 women 18-23          45
##  2 important                        0.329        12 women 18-23          45
##  3 very important                   0.160         7 women 18-23          45
##  4 no/little                        0.249         9 women 24-40          45
##  5 important                        0.375        21 women 24-40          45
##  6 very important                   0.376        15 women 24-40          45
##  7 no/little                        0.101         5 women > 40           60
##  8 important                        0.259        14 women > 40           60
##  9 very important                   0.640        41 women > 40           60
## 10 no/little                        0.650        40 men   18-23          65
## 11 important                        0.253        17 men   18-23          65
## 12 very important                   0.0970        8 men   18-23          65
## 13 no/little                        0.371        17 men   24-40          44
## 14 important                        0.376        15 men   24-40          44
## 15 very important                   0.253        12 men   24-40          44
## 16 no/little                        0.166         8 men   > 40           41
## 17 important                        0.333        15 men   > 40           41
## 18 very important                   0.500        18 men   > 40           41
## # ℹ 2 more variables: fitted.value <dbl>, Pearson.residual <dbl>
sum.squared.residuals <- fitted(res.polr) |> as_tibble() |> distinct(.keep_all = TRUE)|> pivot_longer(
    cols = c('no/little', 'important', 'very important'), 
    names_to = "Importance.Rating", 
    values_to = "Estimated.probability"
  ) |> bind_cols(Obs.freq = Cars2$frequency, 
                              sex = Cars2$sex,
                              age = Cars2$age) |> 
  group_by(sex, age) |> 
  mutate(group.total = sum(Obs.freq)) |> 
  mutate(fitted.value = Estimated.probability*group.total,
         Pearson.residual = (Obs.freq-fitted.value)/sqrt(fitted.value)
         ) |> ungroup() |> 
  summarize(sum(Pearson.residual^2))
sum.squared.residuals
## # A tibble: 1 × 1
##   `sum(Pearson.residual^2)`
##                       <dbl>
## 1                      4.56
1-pchisq(4.56, df=7)
## [1] 0.7134807
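Since the proportional odds model uses 5 parameters where the nominal logistic model fitted earlier (res.cars) used 8, the two fits can also be compared informally by AIC; a minimal sketch:

AIC(res.cars, res.polr) #smaller is better; both models are fitted to the same data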

Exercises

8.1

If there are only \(J = 2\) response categories, show that the following models all reduce to the logistic regression model for binary data. Suppose the first category is the reference category; the logits for the other categories are then defined by \[\text{logit}(\pi_j)=\log(\pi_j/\pi_1)=\mathbf x_j^T\boldsymbol\beta_j,\quad \text{for}\quad j=2,\dots,J.\tag{8.4}\] The cumulative logit model is \[\log\left(\frac{\pi_1+\cdots+\pi_j}{\pi_{j+1}+\cdots+\pi_J}\right)=\mathbf x_j^T\boldsymbol\beta_j\tag{8.13}\] the adjacent category logit model is \[\log\left(\frac{\pi_j}{\pi_{j+1}}\right)=\mathbf x_j^T\boldsymbol\beta_j\tag{8.15}\] and the continuation ratio logit model is \[\log\left(\frac{\pi_j}{\pi_{j+1}+\cdots+\pi_{J}}\right)=\mathbf x_j^T\boldsymbol\beta_j\tag{8.16}\]

With \(J=2\) each model has a single logit: \(j=2\) in (8.4) and \(j=1\) in (8.13), (8.15) and (8.16). Writing \(\pi\) for the probability in the numerator, so that the denominator is \(1-\pi\), every one of them takes the binary logistic form

\[\log\left(\frac{\pi}{1-\pi}\right)=\mathbf x^T\boldsymbol\beta\]

#Exercises 8.2
housing
##       Sat   Infl      Type Cont Freq
## 1     Low    Low     Tower  Low   21
## 2  Medium    Low     Tower  Low   21
## 3    High    Low     Tower  Low   28
## 4     Low Medium     Tower  Low   34
## 5  Medium Medium     Tower  Low   22
## 6    High Medium     Tower  Low   36
## 7     Low   High     Tower  Low   10
## 8  Medium   High     Tower  Low   11
## 9    High   High     Tower  Low   36
## 10    Low    Low Apartment  Low   61
## 11 Medium    Low Apartment  Low   23
## 12   High    Low Apartment  Low   17
## 13    Low Medium Apartment  Low   43
## 14 Medium Medium Apartment  Low   35
## 15   High Medium Apartment  Low   40
## 16    Low   High Apartment  Low   26
## 17 Medium   High Apartment  Low   18
## 18   High   High Apartment  Low   54
## 19    Low    Low    Atrium  Low   13
## 20 Medium    Low    Atrium  Low    9
## 21   High    Low    Atrium  Low   10
## 22    Low Medium    Atrium  Low    8
## 23 Medium Medium    Atrium  Low    8
## 24   High Medium    Atrium  Low   12
## 25    Low   High    Atrium  Low    6
## 26 Medium   High    Atrium  Low    7
## 27   High   High    Atrium  Low    9
## 28    Low    Low   Terrace  Low   18
## 29 Medium    Low   Terrace  Low    6
## 30   High    Low   Terrace  Low    7
## 31    Low Medium   Terrace  Low   15
## 32 Medium Medium   Terrace  Low   13
## 33   High Medium   Terrace  Low   13
## 34    Low   High   Terrace  Low    7
## 35 Medium   High   Terrace  Low    5
## 36   High   High   Terrace  Low   11
## 37    Low    Low     Tower High   14
## 38 Medium    Low     Tower High   19
## 39   High    Low     Tower High   37
## 40    Low Medium     Tower High   17
## 41 Medium Medium     Tower High   23
## 42   High Medium     Tower High   40
## 43    Low   High     Tower High    3
## 44 Medium   High     Tower High    5
## 45   High   High     Tower High   23
## 46    Low    Low Apartment High   78
## 47 Medium    Low Apartment High   46
## 48   High    Low Apartment High   43
## 49    Low Medium Apartment High   48
## 50 Medium Medium Apartment High   45
## 51   High Medium Apartment High   86
## 52    Low   High Apartment High   15
## 53 Medium   High Apartment High   25
## 54   High   High Apartment High   62
## 55    Low    Low    Atrium High   20
## 56 Medium    Low    Atrium High   23
## 57   High    Low    Atrium High   20
## 58    Low Medium    Atrium High   10
## 59 Medium Medium    Atrium High   22
## 60   High Medium    Atrium High   24
## 61    Low   High    Atrium High    7
## 62 Medium   High    Atrium High   10
## 63   High   High    Atrium High   21
## 64    Low    Low   Terrace High   57
## 65 Medium    Low   Terrace High   23
## 66   High    Low   Terrace High   13
## 67    Low Medium   Terrace High   31
## 68 Medium Medium   Terrace High   21
## 69   High Medium   Terrace High   13
## 70    Low   High   Terrace High    5
## 71 Medium   High   Terrace High    6
## 72   High   High   Terrace High   13
housing$Sat2 <- factor(housing$Sat, levels = c("Low","Medium","High"))
housing$Type2 <- factor(housing$Type, levels = c("Tower","Apartment","Atrium","Terrace"))
housing$Cont2 <- factor(housing$Cont, levels = c("Low","High"))
housing
##       Sat   Infl      Type Cont Freq   Sat2     Type2 Cont2
## 1     Low    Low     Tower  Low   21    Low     Tower   Low
## 2  Medium    Low     Tower  Low   21 Medium     Tower   Low
## 3    High    Low     Tower  Low   28   High     Tower   Low
## 4     Low Medium     Tower  Low   34    Low     Tower   Low
## 5  Medium Medium     Tower  Low   22 Medium     Tower   Low
## 6    High Medium     Tower  Low   36   High     Tower   Low
## 7     Low   High     Tower  Low   10    Low     Tower   Low
## 8  Medium   High     Tower  Low   11 Medium     Tower   Low
## 9    High   High     Tower  Low   36   High     Tower   Low
## 10    Low    Low Apartment  Low   61    Low Apartment   Low
## 11 Medium    Low Apartment  Low   23 Medium Apartment   Low
## 12   High    Low Apartment  Low   17   High Apartment   Low
## 13    Low Medium Apartment  Low   43    Low Apartment   Low
## 14 Medium Medium Apartment  Low   35 Medium Apartment   Low
## 15   High Medium Apartment  Low   40   High Apartment   Low
## 16    Low   High Apartment  Low   26    Low Apartment   Low
## 17 Medium   High Apartment  Low   18 Medium Apartment   Low
## 18   High   High Apartment  Low   54   High Apartment   Low
## 19    Low    Low    Atrium  Low   13    Low    Atrium   Low
## 20 Medium    Low    Atrium  Low    9 Medium    Atrium   Low
## 21   High    Low    Atrium  Low   10   High    Atrium   Low
## 22    Low Medium    Atrium  Low    8    Low    Atrium   Low
## 23 Medium Medium    Atrium  Low    8 Medium    Atrium   Low
## 24   High Medium    Atrium  Low   12   High    Atrium   Low
## 25    Low   High    Atrium  Low    6    Low    Atrium   Low
## 26 Medium   High    Atrium  Low    7 Medium    Atrium   Low
## 27   High   High    Atrium  Low    9   High    Atrium   Low
## 28    Low    Low   Terrace  Low   18    Low   Terrace   Low
## 29 Medium    Low   Terrace  Low    6 Medium   Terrace   Low
## 30   High    Low   Terrace  Low    7   High   Terrace   Low
## 31    Low Medium   Terrace  Low   15    Low   Terrace   Low
## 32 Medium Medium   Terrace  Low   13 Medium   Terrace   Low
## 33   High Medium   Terrace  Low   13   High   Terrace   Low
## 34    Low   High   Terrace  Low    7    Low   Terrace   Low
## 35 Medium   High   Terrace  Low    5 Medium   Terrace   Low
## 36   High   High   Terrace  Low   11   High   Terrace   Low
## 37    Low    Low     Tower High   14    Low     Tower  High
## 38 Medium    Low     Tower High   19 Medium     Tower  High
## 39   High    Low     Tower High   37   High     Tower  High
## 40    Low Medium     Tower High   17    Low     Tower  High
## 41 Medium Medium     Tower High   23 Medium     Tower  High
## 42   High Medium     Tower High   40   High     Tower  High
## 43    Low   High     Tower High    3    Low     Tower  High
## 44 Medium   High     Tower High    5 Medium     Tower  High
## 45   High   High     Tower High   23   High     Tower  High
## 46    Low    Low Apartment High   78    Low Apartment  High
## 47 Medium    Low Apartment High   46 Medium Apartment  High
## 48   High    Low Apartment High   43   High Apartment  High
## 49    Low Medium Apartment High   48    Low Apartment  High
## 50 Medium Medium Apartment High   45 Medium Apartment  High
## 51   High Medium Apartment High   86   High Apartment  High
## 52    Low   High Apartment High   15    Low Apartment  High
## 53 Medium   High Apartment High   25 Medium Apartment  High
## 54   High   High Apartment High   62   High Apartment  High
## 55    Low    Low    Atrium High   20    Low    Atrium  High
## 56 Medium    Low    Atrium High   23 Medium    Atrium  High
## 57   High    Low    Atrium High   20   High    Atrium  High
## 58    Low Medium    Atrium High   10    Low    Atrium  High
## 59 Medium Medium    Atrium High   22 Medium    Atrium  High
## 60   High Medium    Atrium High   24   High    Atrium  High
## 61    Low   High    Atrium High    7    Low    Atrium  High
## 62 Medium   High    Atrium High   10 Medium    Atrium  High
## 63   High   High    Atrium High   21   High    Atrium  High
## 64    Low    Low   Terrace High   57    Low   Terrace  High
## 65 Medium    Low   Terrace High   23 Medium   Terrace  High
## 66   High    Low   Terrace High   13   High   Terrace  High
## 67    Low Medium   Terrace High   31    Low   Terrace  High
## 68 Medium Medium   Terrace High   21 Medium   Terrace  High
## 69   High Medium   Terrace High   13   High   Terrace  High
## 70    Low   High   Terrace High    5    Low   Terrace  High
## 71 Medium   High   Terrace High    6 Medium   Terrace  High
## 72   High   High   Terrace High   13   High   Terrace  High
library(nnet)
#levels of satisfaction and type of housing
res.housing.1=multinom(Sat2~Type2, weights=Freq,data=housing)
## # weights:  15 (8 variable)
## initial  value 1846.767257 
## iter  10 value 1794.655475
## final  value 1794.104446 
## converged
summary(res.housing.1)
## Call:
## multinom(formula = Sat2 ~ Type2, data = housing, weights = Freq)
## 
## Coefficients:
##        (Intercept) Type2Apartment Type2Atrium Type2Terrace
## Medium  0.02000502     -0.3646237   0.1905561   -0.6062867
## High    0.70320372     -0.5948949  -0.2977434   -1.3450483
## 
## Std. Errors:
##        (Intercept) Type2Apartment Type2Atrium Type2Terrace
## Medium   0.1414286      0.1700012   0.2197390    0.2025695
## High     0.1228863      0.1486686   0.2028366    0.1921080
## 
## Residual Deviance: 3588.209 
## AIC: 3604.209
#levels of satisfaction and type of contact
res.housing.2=multinom(Sat2~Cont2, weights=Freq,data=housing)
## # weights:  9 (4 variable)
## initial  value 1846.767257 
## final  value 1821.875901 
## converged
summary(res.housing.2)
## Call:
## multinom(formula = Sat2 ~ Cont2, data = housing, weights = Freq)
## 
## Coefficients:
##        (Intercept) Cont2High
## Medium -0.38656248 0.2572377
## High    0.04113022 0.2174434
## 
## Std. Errors:
##        (Intercept) Cont2High
## Medium  0.09713283 0.1282373
## High    0.08648579 0.1152828
## 
## Residual Deviance: 3643.752 
## AIC: 3651.752
#levels of satisfaction and type of housing and contact
res.housing.3=multinom(Sat2~Type2+Cont2, weights=Freq,data=housing)
## # weights:  18 (10 variable)
## initial  value 1846.767257 
## iter  10 value 1793.932058
## final  value 1789.600661 
## converged
summary(res.housing.3)
## Call:
## multinom(formula = Sat2 ~ Type2 + Cont2, data = housing, weights = Freq)
## 
## Coefficients:
##        (Intercept) Type2Apartment Type2Atrium Type2Terrace Cont2High
## Medium  -0.1091063      -0.407446   0.1278116   -0.6738718 0.3005283
## High     0.5586042      -0.642400  -0.3672630   -1.4199239 0.3334568
## 
## Std. Errors:
##        (Intercept) Type2Apartment Type2Atrium Type2Terrace Cont2High
## Medium   0.1524817      0.1713221   0.2217222    0.2051505 0.1306991
## High     0.1330480      0.1501078   0.2048673    0.1947044 0.1190333
## 
## Residual Deviance: 3579.201 
## AIC: 3599.201
res.housing.3$edf
## [1] 10
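To judge whether each explanatory variable is needed, the single-covariate models can be compared with the model containing both by differencing residual deviances; the degrees of freedom are the differences in the numbers of fitted parameters (edf). A minimal sketch:

#effect of contact, adjusting for housing type
1-pchisq(res.housing.1$deviance - res.housing.3$deviance,
         df=res.housing.3$edf - res.housing.1$edf)
#effect of housing type, adjusting for contact
1-pchisq(res.housing.2$deviance - res.housing.3$deviance,
         df=res.housing.3$edf - res.housing.2$edf)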
#Exercise 8.3
tumor
## # A tibble: 16 × 4
##    treatment   sex    response           frequency
##    <chr>       <chr>  <chr>                  <dbl>
##  1 sequential  male   progressive               28
##  2 sequential  male   no change                 45
##  3 sequential  male   partial remission         29
##  4 sequential  male   complete remission        26
##  5 sequential  female progressive                4
##  6 sequential  female no change                 12
##  7 sequential  female partial remission          5
##  8 sequential  female complete remission         2
##  9 alternating male   progressive               41
## 10 alternating male   no change                 44
## 11 alternating male   partial remission         20
## 12 alternating male   complete remission        20
## 13 alternating female progressive               12
## 14 alternating female no change                  7
## 15 alternating female partial remission          3
## 16 alternating female complete remission         1
tumor$treatment2 <- factor(tumor$treatment, levels = c("sequential","alternating"))
tumor$sex2 <- factor(tumor$sex, levels = c("male","female"))
tumor$response2 <- factor(tumor$response, levels = c("progressive","no change","partial remission","complete remission"))
tumor
## # A tibble: 16 × 7
##    treatment   sex    response           frequency treatment2  sex2   response2 
##    <chr>       <chr>  <chr>                  <dbl> <fct>       <fct>  <fct>     
##  1 sequential  male   progressive               28 sequential  male   progressi…
##  2 sequential  male   no change                 45 sequential  male   no change 
##  3 sequential  male   partial remission         29 sequential  male   partial r…
##  4 sequential  male   complete remission        26 sequential  male   complete …
##  5 sequential  female progressive                4 sequential  female progressi…
##  6 sequential  female no change                 12 sequential  female no change 
##  7 sequential  female partial remission          5 sequential  female partial r…
##  8 sequential  female complete remission         2 sequential  female complete …
##  9 alternating male   progressive               41 alternating male   progressi…
## 10 alternating male   no change                 44 alternating male   no change 
## 11 alternating male   partial remission         20 alternating male   partial r…
## 12 alternating male   complete remission        20 alternating male   complete …
## 13 alternating female progressive               12 alternating female progressi…
## 14 alternating female no change                  7 alternating female no change 
## 15 alternating female partial remission          3 alternating female partial r…
## 16 alternating female complete remission         1 alternating female complete …
res.tumor=multinom(response2~treatment2+sex2, weights=frequency,data=tumor)
## # weights:  16 (9 variable)
## initial  value 414.502014 
## iter  10 value 393.063983
## final  value 393.017781 
## converged
summary(res.tumor)
## Call:
## multinom(formula = response2 ~ treatment2 + sex2, data = tumor, 
##     weights = frequency)
## 
## Coefficients:
##                    (Intercept) treatment2alternating  sex2female
## no change           0.59410378            -0.6164471 -0.09022479
## partial remission   0.12100143            -0.8978741 -0.36184200
## complete remission  0.01640707            -0.7986077 -1.27828458
## 
## Std. Errors:
##                    (Intercept) treatment2alternating sex2female
## no change            0.2320622             0.2954662  0.3786300
## partial remission    0.2586785             0.3511362  0.4771684
## complete remission   0.2685967             0.3677728  0.6605406
## 
## Residual Deviance: 786.0356 
## AIC: 804.0356
res.tumor$edf
## [1] 9
coefficients <- summary(res.tumor)$coefficients
coefficients
##                    (Intercept) treatment2alternating  sex2female
## no change           0.59410378            -0.6164471 -0.09022479
## partial remission   0.12100143            -0.8978741 -0.36184200
## complete remission  0.01640707            -0.7986077 -1.27828458
#alternating vs. sequential
exp(coefficients[,2])
##          no change  partial remission complete remission 
##          0.5398591          0.4074349          0.4499550
#Wald statistic
coefficients
##                    (Intercept) treatment2alternating  sex2female
## no change           0.59410378            -0.6164471 -0.09022479
## partial remission   0.12100143            -0.8978741 -0.36184200
## complete remission  0.01640707            -0.7986077 -1.27828458
coefficients/summary(res.tumor)$standard.errors
##                    (Intercept) treatment2alternating sex2female
## no change           2.56010567             -2.086354 -0.2382928
## partial remission   0.46776758             -2.557054 -0.7583110
## complete remission  0.06108443             -2.171470 -1.9352097
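Two-sided p-values follow from these Wald statistics under the asymptotic normal approximation; a minimal sketch (wald.tumor is just a temporary name):

wald.tumor <- coefficients/summary(res.tumor)$standard.errors
2*(1-pnorm(abs(wald.tumor))) #two-sided p-values for each coefficient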
fitted(res.tumor)
##    progressive no change partial remission complete remission
## 1    0.2017522 0.3654553         0.2277029         0.20508966
## 2    0.2017522 0.3654553         0.2277029         0.20508966
## 3    0.2017522 0.3654553         0.2277029         0.20508966
## 4    0.2017522 0.3654553         0.2277029         0.20508966
## 5    0.2685127 0.4444233         0.2110421         0.07602186
## 6    0.2685127 0.4444233         0.2110421         0.07602186
## 7    0.2685127 0.4444233         0.2110421         0.07602186
## 8    0.2685127 0.4444233         0.2110421         0.07602186
## 9    0.3454059 0.3377739         0.1588321         0.15798808
## 10   0.3454059 0.3377739         0.1588321         0.15798808
## 11   0.3454059 0.3377739         0.1588321         0.15798808
## 12   0.3454059 0.3377739         0.1588321         0.15798808
## 13   0.4271389 0.3816642         0.1367828         0.05441413
## 14   0.4271389 0.3816642         0.1367828         0.05441413
## 15   0.4271389 0.3816642         0.1367828         0.05441413
## 16   0.4271389 0.3816642         0.1367828         0.05441413
fitted(res.tumor) |> as_tibble() |> distinct(.keep_all = TRUE)|> pivot_longer(
    cols = c('progressive', 'no change', 'partial remission', 'complete remission'), 
    names_to = "stage", 
    values_to = "Estimated.probability"
  ) |> bind_cols(Obs.freq = tumor$frequency, 
                              sex = tumor$sex2,
                              treatment = tumor$treatment2) |> 
  group_by(sex, treatment) |> 
  mutate(group.total = sum(Obs.freq)) |> 
  mutate(fitted.value = Estimated.probability*group.total,
         Pearson.residual = (Obs.freq-fitted.value)/sqrt(fitted.value)
         )
## # A tibble: 16 × 8
## # Groups:   sex, treatment [4]
##    stage Estimated.probability Obs.freq sex   treatment group.total fitted.value
##    <chr>                 <dbl>    <dbl> <fct> <fct>           <dbl>        <dbl>
##  1 prog…                0.202        28 male  sequenti…         128        25.8 
##  2 no c…                0.365        45 male  sequenti…         128        46.8 
##  3 part…                0.228        29 male  sequenti…         128        29.1 
##  4 comp…                0.205        26 male  sequenti…         128        26.3 
##  5 prog…                0.269         4 fema… sequenti…          23         6.18
##  6 no c…                0.444        12 fema… sequenti…          23        10.2 
##  7 part…                0.211         5 fema… sequenti…          23         4.85
##  8 comp…                0.0760        2 fema… sequenti…          23         1.75
##  9 prog…                0.345        41 male  alternat…         125        43.2 
## 10 no c…                0.338        44 male  alternat…         125        42.2 
## 11 part…                0.159        20 male  alternat…         125        19.9 
## 12 comp…                0.158        20 male  alternat…         125        19.7 
## 13 prog…                0.427        12 fema… alternat…          23         9.82
## 14 no c…                0.382         7 fema… alternat…          23         8.78
## 15 part…                0.137         3 fema… alternat…          23         3.15
## 16 comp…                0.0544        1 fema… alternat…          23         1.25
## # ℹ 1 more variable: Pearson.residual <dbl>
fitted(res.tumor) |> as_tibble() |> distinct(.keep_all = TRUE)|> pivot_longer(
    cols = c('progressive', 'no change', 'partial remission', 'complete remission'), 
    names_to = "stage", 
    values_to = "Estimated.probability"
  ) |> bind_cols(Obs.freq = tumor$frequency, 
                              sex = tumor$sex2,
                              treatment = tumor$treatment2) |> 
  group_by(sex, treatment) |> 
  mutate(group.total = sum(Obs.freq)) |> 
  mutate(fitted.value = Estimated.probability*group.total,
         Pearson.residual = (Obs.freq-fitted.value)/sqrt(fitted.value)
         ) |> ungroup() |> 
  summarize(sum(Pearson.residual^2))
## # A tibble: 1 × 1
##   `sum(Pearson.residual^2)`
##                       <dbl>
## 1                      2.46
res.tumor$edf
## [1] 9
1-pchisq(2.46, df=res.tumor$edf)
## [1] 0.9819399

Chapter9 Poisson Regression and Log-Linear Models

doctors
## # A tibble: 10 × 4
##    age      smoking    deaths `person-years`
##    <chr>    <chr>       <dbl>          <dbl>
##  1 35 to 44 smoker         32          52407
##  2 45 to 54 smoker        104          43248
##  3 55 to 64 smoker        206          28612
##  4 65 to 74 smoker        186          12663
##  5 75 to 84 smoker        102           5317
##  6 35 to 44 non-smoker      2          18790
##  7 45 to 54 non-smoker     12          10673
##  8 55 to 64 non-smoker     28           5710
##  9 65 to 74 non-smoker     28           2585
## 10 75 to 84 non-smoker     31           1462
knitr::kable(doctors)
| age      | smoking    | deaths | person-years |
|:---------|:-----------|-------:|-------------:|
| 35 to 44 | smoker     |     32 |        52407 |
| 45 to 54 | smoker     |    104 |        43248 |
| 55 to 64 | smoker     |    206 |        28612 |
| 65 to 74 | smoker     |    186 |        12663 |
| 75 to 84 | smoker     |    102 |         5317 |
| 35 to 44 | non-smoker |      2 |        18790 |
| 45 to 54 | non-smoker |     12 |        10673 |
| 55 to 64 | non-smoker |     28 |         5710 |
| 65 to 74 | non-smoker |     28 |         2585 |
| 75 to 84 | non-smoker |     31 |         1462 |
pander::pander(doctors, style = 'rmarkdown')

| age      | smoking    | deaths | person-years |
|:---------|:-----------|-------:|-------------:|
| 35 to 44 | smoker     |     32 |        52407 |
| 45 to 54 | smoker     |    104 |        43248 |
| 55 to 64 | smoker     |    206 |        28612 |
| 65 to 74 | smoker     |    186 |        12663 |
| 75 to 84 | smoker     |    102 |         5317 |
| 35 to 44 | non-smoker |      2 |        18790 |
| 45 to 54 | non-smoker |     12 |        10673 |
| 55 to 64 | non-smoker |     28 |         5710 |
| 65 to 74 | non-smoker |     28 |         2585 |
| 75 to 84 | non-smoker |     31 |         1462 |

9.2 Poisson regression

library(tidyverse)
library(ggprism)
doctors |> ggplot(aes(x=age,
                      y=deaths*100000/`person-years`,
                      group=smoking))+geom_point(size=3, aes(shape=smoking)) +
  scale_colour_prism(palette = "colorblind_safe", 
                     labels = c("No smoking", "smoking")) +
  theme_prism(border = TRUE, 
              palette = "black_and_white", 
              base_size = 12)

doctors |> mutate(agecat = c(1:5,1:5),
              agesq = agecat^2,
              smoke = c(rep(1,5), rep(0,5)),
              smokage = c(1:5, rep(0,5)))
## # A tibble: 10 × 8
##    age      smoking    deaths `person-years` agecat agesq smoke smokage
##    <chr>    <chr>       <dbl>          <dbl>  <int> <dbl> <dbl>   <dbl>
##  1 35 to 44 smoker         32          52407      1     1     1       1
##  2 45 to 54 smoker        104          43248      2     4     1       2
##  3 55 to 64 smoker        206          28612      3     9     1       3
##  4 65 to 74 smoker        186          12663      4    16     1       4
##  5 75 to 84 smoker        102           5317      5    25     1       5
##  6 35 to 44 non-smoker      2          18790      1     1     0       0
##  7 45 to 54 non-smoker     12          10673      2     4     0       0
##  8 55 to 64 non-smoker     28           5710      3     9     0       0
##  9 65 to 74 non-smoker     28           2585      4    16     0       0
## 10 75 to 84 non-smoker     31           1462      5    25     0       0
doctors2 <- doctors |> mutate(agecat = c(1:5,1:5),
              agesq = agecat^2,
              smoke = c(rep(1,5), rep(0,5)),
              smokage = c(1:5, rep(0,5)))
doctors2
## # A tibble: 10 × 8
##    age      smoking    deaths `person-years` agecat agesq smoke smokage
##    <chr>    <chr>       <dbl>          <dbl>  <int> <dbl> <dbl>   <dbl>
##  1 35 to 44 smoker         32          52407      1     1     1       1
##  2 45 to 54 smoker        104          43248      2     4     1       2
##  3 55 to 64 smoker        206          28612      3     9     1       3
##  4 65 to 74 smoker        186          12663      4    16     1       4
##  5 75 to 84 smoker        102           5317      5    25     1       5
##  6 35 to 44 non-smoker      2          18790      1     1     0       0
##  7 45 to 54 non-smoker     12          10673      2     4     0       0
##  8 55 to 64 non-smoker     28           5710      3     9     0       0
##  9 65 to 74 non-smoker     28           2585      4    16     0       0
## 10 75 to 84 non-smoker     31           1462      5    25     0       0
res.doc<-glm(deaths~agecat + agesq + smoke + smoke:agecat + offset(log(`person-years`)),
             family=poisson(link="log"),data=doctors2)
summary(res.doc)
## 
## Call:
## glm(formula = deaths ~ agecat + agesq + smoke + smoke:agecat + 
##     offset(log(`person-years`)), family = poisson(link = "log"), 
##     data = doctors2)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -10.79176    0.45008 -23.978  < 2e-16 ***
## agecat         2.37648    0.20795  11.428  < 2e-16 ***
## agesq         -0.19768    0.02737  -7.223 5.08e-13 ***
## smoke          1.44097    0.37220   3.872 0.000108 ***
## agecat:smoke  -0.30755    0.09704  -3.169 0.001528 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 935.0673  on 9  degrees of freedom
## Residual deviance:   1.6354  on 5  degrees of freedom
## AIC: 66.703
## 
## Number of Fisher Scoring iterations: 4
summary(res.doc)$coefficients
##                 Estimate Std. Error    z value      Pr(>|z|)
## (Intercept)  -10.7917625 0.45007723 -23.977579 4.766017e-127
## agecat         2.3764783 0.20794860  11.428201  3.023077e-30
## agesq         -0.1976765 0.02736742  -7.223060  5.083068e-13
## smoke          1.4409719 0.37219886   3.871511  1.081629e-04
## agecat:smoke  -0.3075481 0.09704114  -3.169255  1.528305e-03
#Table9.2
tibble(beta_hat=c(res.doc$coefficients[2:5]),
       se_beta_hat=c(summary(res.doc)$coefficients[2:5,2]),
       Wald_statistic=beta_hat/se_beta_hat,
       p_value=2*(1-pnorm(abs(Wald_statistic))),
       Rate_ratio=exp(beta_hat),
       `95%CI_low`=exp(beta_hat+qnorm(0.025)*summary(res.doc)$coefficients[2:5,2]),
       `95%CI_high`=exp(beta_hat+qnorm(0.975)*summary(res.doc)$coefficients[2:5,2])
       )
## # A tibble: 4 × 7
##   beta_hat se_beta_hat Wald_statistic  p_value Rate_ratio `95%CI_low`
##      <dbl>       <dbl>          <dbl>    <dbl>      <dbl>       <dbl>
## 1    2.38       0.208           11.4  0            10.8         7.16 
## 2   -0.198      0.0274          -7.22 5.08e-13      0.821       0.778
## 3    1.44       0.372            3.87 1.08e- 4      4.22        2.04 
## 4   -0.308      0.0970          -3.17 1.53e- 3      0.735       0.608
## # ℹ 1 more variable: `95%CI_high` <dbl>
fit_p=c(fitted(res.doc))
fit_p
##          1          2          3          4          5          6          7 
##  29.584734 106.811960 208.198646 182.827893 102.576767   3.414801  11.541629 
##          8          9         10 
##  24.743377  30.229155  31.071038
pearsonresid<-(doctors$deaths-fit_p)/sqrt(fit_p)
pearsonresid
##           1           2           3           4           5           6 
##  0.44404929 -0.27208163 -0.15237591  0.23459923 -0.05694769 -0.76561908 
##           7           8           9          10 
##  0.13492231  0.65469354 -0.40544060 -0.01274427
chisq<-sum(pearsonresid*pearsonresid)
chisq
## [1] 1.550251
devres<-sign(doctors2$deaths-fit_p)*(sqrt(2*(doctors2$deaths*log(doctors2$deaths/fit_p)-(doctors2$deaths-fit_p))))
devres
##           1           2           3           4           5           6 
##  0.43820403 -0.27328873 -0.15264528  0.23392570 -0.05700118 -0.83049031 
##           7           8           9          10 
##  0.13404370  0.64106682 -0.41058325 -0.01274913
deviance<-sum(devres*devres)
deviance
## [1] 1.63537
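The hand computations can be checked against the built-in residual extractors for glm objects; a minimal sketch:

residuals(res.doc, type="pearson")  #should reproduce pearsonresid
residuals(res.doc, type="deviance") #should reproduce devres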
#Table9.3
tibble(Age.cat=doctors2$agecat,
       smoking.cat=doctors2$smoke,
       Observed.death=doctors2$deaths,
       Expected.death=fit_p,
       Pearson.residual=pearsonresid,
       Deviance.residual=devres)
## # A tibble: 10 × 6
##    Age.cat smoking.cat Observed.death Expected.death Pearson.residual
##      <int>       <dbl>          <dbl>          <dbl>            <dbl>
##  1       1           1             32          29.6            0.444 
##  2       2           1            104         107.            -0.272 
##  3       3           1            206         208.            -0.152 
##  4       4           1            186         183.             0.235 
##  5       5           1            102         103.            -0.0569
##  6       1           0              2           3.41          -0.766 
##  7       2           0             12          11.5            0.135 
##  8       3           0             28          24.7            0.655 
##  9       4           0             28          30.2           -0.405 
## 10       5           0             31          31.1           -0.0127
## # ℹ 1 more variable: Deviance.residual <dbl>
res.doc$deviance
## [1] 1.63537
res.doc.mini<-glm(deaths~1 + offset(log(`person-years`)),
             family=poisson(link="log"),data=doctors2)
summary(res.doc.mini)
## 
## Call:
## glm(formula = deaths ~ 1 + offset(log(`person-years`)), family = poisson(link = "log"), 
##     data = doctors2)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.51442    0.03698  -149.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 935.07  on 9  degrees of freedom
## Residual deviance: 935.07  on 9  degrees of freedom
## AIC: 992.14
## 
## Number of Fisher Scoring iterations: 5
logLik(res.doc.mini)
## 'log Lik.' -495.0676 (df=1)
logLik(res.doc)
## 'log Lik.' -28.35166 (df=5)
#C, the likelihood-ratio chi-squared statistic
2*(logLik(res.doc)-logLik(res.doc.mini))
## 'log Lik.' 933.432 (df=5)
1-pchisq(933.432, df=4)
## [1] 0
#pseudo R2
(logLik(res.doc.mini)-logLik(res.doc))/logLik(res.doc.mini)
## 'log Lik.' 0.9427318 (df=1)
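The same likelihood-ratio statistic C can be read from an analysis-of-deviance table; a minimal sketch:

anova(res.doc.mini, res.doc, test="Chisq")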
#Table9.4
melanoma <- structure(list(type = c("hutchinson's melanotic freckle", "hutchinson's melanotic freckle", 
"hutchinson's melanotic freckle", "superficial spreading melanoma", 
"superficial spreading melanoma", "superficial spreading melanoma", 
"nodular", "nodular", "nodular", "indeterminate", "indeterminate", 
"indeterminate"), site = c("head & neck", "trunk", "extremities", 
"head & neck", "trunk", "extremities", "head & neck", "trunk", 
"extremities", "head & neck", "trunk", "extremities"), frequency = c(22, 
2, 10, 16, 54, 115, 19, 33, 73, 11, 17, 28)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -12L))
melanoma
## # A tibble: 12 × 3
##    type                           site        frequency
##    <chr>                          <chr>           <dbl>
##  1 hutchinson's melanotic freckle head & neck        22
##  2 hutchinson's melanotic freckle trunk               2
##  3 hutchinson's melanotic freckle extremities        10
##  4 superficial spreading melanoma head & neck        16
##  5 superficial spreading melanoma trunk              54
##  6 superficial spreading melanoma extremities       115
##  7 nodular                        head & neck        19
##  8 nodular                        trunk              33
##  9 nodular                        extremities        73
## 10 indeterminate                  head & neck        11
## 11 indeterminate                  trunk              17
## 12 indeterminate                  extremities        28
library(tidyverse)
melanoma |> tidyr::pivot_wider(names_from = type, values_from = frequency) |> 
  mutate(sum=`hutchinson's melanotic freckle`+ `superficial spreading melanoma` +nodular+ indeterminate)
## # A tibble: 3 × 6
##   site       hutchinson's melanot…¹ superficial spreadin…² nodular indeterminate
##   <chr>                       <dbl>                  <dbl>   <dbl>         <dbl>
## 1 head & ne…                     22                     16      19            11
## 2 trunk                           2                     54      33            17
## 3 extremiti…                     10                    115      73            28
## # ℹ abbreviated names: ¹​`hutchinson's melanotic freckle`,
## #   ²​`superficial spreading melanoma`
## # ℹ 1 more variable: sum <dbl>
library(tidyverse)
library(gt)
melanoma |> pivot_wider(names_from = type, values_from = frequency) |> 
  mutate(sum=`hutchinson's melanotic freckle`+ `superficial spreading melanoma` +nodular+ indeterminate) |> 
  gt(rowname_col = "site") |> 
  grand_summary_rows(fns = list(fn = "sum"))
| site        | hutchinson's melanotic freckle | superficial spreading melanoma | nodular | indeterminate | sum |
|:------------|-------------------------------:|-------------------------------:|--------:|--------------:|----:|
| head & neck |                             22 |                             16 |      19 |            11 |  68 |
| trunk       |                              2 |                             54 |      33 |            17 | 106 |
| extremities |                             10 |                            115 |      73 |            28 | 226 |
| sum         |                             34 |                            185 |     125 |            56 | 400 |
melanoma2 <- melanoma |> pivot_wider(names_from = type, values_from = frequency) |> 
  mutate(sum=`hutchinson's melanotic freckle`+ `superficial spreading melanoma` +nodular+ indeterminate) |> 
  gt(rowname_col = "site") |> 
  grand_summary_rows(fns = list(fn = "sum"))
melanoma2
| site        | hutchinson's melanotic freckle | superficial spreading melanoma | nodular | indeterminate | sum |
|:------------|-------------------------------:|-------------------------------:|--------:|--------------:|----:|
| head & neck |                             22 |                             16 |      19 |            11 |  68 |
| trunk       |                              2 |                             54 |      33 |            17 | 106 |
| extremities |                             10 |                            115 |      73 |            28 | 226 |
| sum         |                             34 |                            185 |     125 |            56 | 400 |
melanoma3 <- melanoma |> pivot_wider(names_from = type, values_from = frequency) |> 
  mutate(sum=`hutchinson's melanotic freckle`+ `superficial spreading melanoma` +nodular+ indeterminate)
melanoma3
## # A tibble: 3 × 6
##   site       hutchinson's melanot…¹ superficial spreadin…² nodular indeterminate
##   <chr>                       <dbl>                  <dbl>   <dbl>         <dbl>
## 1 head & ne…                     22                     16      19            11
## 2 trunk                           2                     54      33            17
## 3 extremiti…                     10                    115      73            28
## # ℹ abbreviated names: ¹​`hutchinson's melanotic freckle`,
## #   ²​`superficial spreading melanoma`
## # ℹ 1 more variable: sum <dbl>
melanoma3[4,] <- list("sum", 34, 185, 125, 56, 400)
melanoma3
## # A tibble: 4 × 6
##   site       hutchinson's melanot…¹ superficial spreadin…² nodular indeterminate
##   <chr>                       <dbl>                  <dbl>   <dbl>         <dbl>
## 1 head & ne…                     22                     16      19            11
## 2 trunk                           2                     54      33            17
## 3 extremiti…                     10                    115      73            28
## 4 sum                            34                    185     125            56
## # ℹ abbreviated names: ¹​`hutchinson's melanotic freckle`,
## #   ²​`superficial spreading melanoma`
## # ℹ 1 more variable: sum <dbl>
#row ratio
melanoma3[4, 2:5]/400
##   hutchinson's melanotic freckle superficial spreading melanoma nodular
## 1                          0.085                         0.4625  0.3125
##   indeterminate
## 1          0.14
#column ratio
melanoma3[1:3, 6]/400
##     sum
## 1 0.170
## 2 0.265
## 3 0.565
str(matrix(unlist(melanoma3[4, 2:5]/400)))
##  num [1:4, 1] 0.085 0.463 0.312 0.14
m1 <- matrix(unlist(melanoma3[4, 2:5]/400), nrow=1)
m1
##       [,1]   [,2]   [,3] [,4]
## [1,] 0.085 0.4625 0.3125 0.14
m2 <- matrix(t(melanoma3[1:3, 6]/400), ncol = 1)
m2
##       [,1]
## [1,] 0.170
## [2,] 0.265
## [3,] 0.565
str(m1)
##  num [1, 1:4] 0.085 0.463 0.312 0.14
str(m2)
##  num [1:3, 1] 0.17 0.265 0.565
m2 %*% m1
##          [,1]      [,2]      [,3]   [,4]
## [1,] 0.014450 0.0786250 0.0531250 0.0238
## [2,] 0.022525 0.1225625 0.0828125 0.0371
## [3,] 0.048025 0.2613125 0.1765625 0.0791
(m2 %*% m1) * 400
##       [,1]    [,2]   [,3]  [,4]
## [1,]  5.78  31.450 21.250  9.52
## [2,]  9.01  49.025 33.125 14.84
## [3,] 19.21 104.525 70.625 31.64
estimated <- (m2 %*% m1) * 400
unlist(t(melanoma3[1:3, 2:5]))
##                                [,1] [,2] [,3]
## hutchinson's melanotic freckle   22    2   10
## superficial spreading melanoma   16   54  115
## nodular                          19   33   73
## indeterminate                    11   17   28
matrix(unlist(t(melanoma3[1:3, 2:5])), ncol=4, byrow = T)
##      [,1] [,2] [,3] [,4]
## [1,]   22   16   19   11
## [2,]    2   54   33   17
## [3,]   10  115   73   28
observed <- matrix(unlist(t(melanoma3[1:3, 2:5])), ncol=4, byrow = T)
#X^2
(observed-estimated)^2/estimated
##           [,1]      [,2]         [,3]      [,4]
## [1,] 45.517024 7.5899046 0.2382352941 0.2300840
## [2,]  5.453951 0.5048572 0.0004716981 0.3143935
## [3,]  4.415622 1.0497548 0.0798672566 0.4187611
sum((observed-estimated)^2/estimated)
## [1] 65.81293
1-pchisq(65.81293, df=6)
## [1] 2.943201e-12
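The same statistic comes straight from the classical Pearson chi-squared test of independence, which builds the expected counts from the margins exactly as above; a minimal sketch:

chisq.test(observed) #should reproduce X^2 = 65.81 on 6 degrees of freedom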
melanoma5 <- melanoma |> mutate(tumor = factor(type, levels = c("hutchinson's melanotic freckle", 
                                                   "superficial spreading melanoma", 
                                                   "nodular",
                                                   "indeterminate")),
                   site2 = factor(site, levels = c("head & neck", 
                                                   "trunk", 
                                                   "extremities"))
                   )
melanoma5
## # A tibble: 12 × 5
##    type                           site        frequency tumor              site2
##    <chr>                          <chr>           <dbl> <fct>              <fct>
##  1 hutchinson's melanotic freckle head & neck        22 hutchinson's mela… head…
##  2 hutchinson's melanotic freckle trunk               2 hutchinson's mela… trunk
##  3 hutchinson's melanotic freckle extremities        10 hutchinson's mela… extr…
##  4 superficial spreading melanoma head & neck        16 superficial sprea… head…
##  5 superficial spreading melanoma trunk              54 superficial sprea… trunk
##  6 superficial spreading melanoma extremities       115 superficial sprea… extr…
##  7 nodular                        head & neck        19 nodular            head…
##  8 nodular                        trunk              33 nodular            trunk
##  9 nodular                        extremities        73 nodular            extr…
## 10 indeterminate                  head & neck        11 indeterminate      head…
## 11 indeterminate                  trunk              17 indeterminate      trunk
## 12 indeterminate                  extremities        28 indeterminate      extr…
ressat.melanoma<-glm(frequency~tumor*site2,family=poisson(),data=melanoma5)
ressat.melanoma
## 
## Call:  glm(formula = frequency ~ tumor * site2, family = poisson(), 
##     data = melanoma5)
## 
## Coefficients:
##                                          (Intercept)  
##                                               3.0910  
##                  tumorsuperficial spreading melanoma  
##                                              -0.3185  
##                                         tumornodular  
##                                              -0.1466  
##                                   tumorindeterminate  
##                                              -0.6931  
##                                           site2trunk  
##                                              -2.3979  
##                                     site2extremities  
##                                              -0.7885  
##       tumorsuperficial spreading melanoma:site2trunk  
##                                               3.6143  
##                              tumornodular:site2trunk  
##                                               2.9500  
##                        tumorindeterminate:site2trunk  
##                                               2.8332  
## tumorsuperficial spreading melanoma:site2extremities  
##                                               2.7608  
##                        tumornodular:site2extremities  
##                                               2.1345  
##                  tumorindeterminate:site2extremities  
##                                               1.7228  
## 
## Degrees of Freedom: 11 Total (i.e. Null);  0 Residual
## Null Deviance:	    295.2 
## Residual Deviance: -1.754e-14 	AIC: 83.11
summary(ressat.melanoma)
## 
## Call:
## glm(formula = frequency ~ tumor * site2, family = poisson(), 
##     data = melanoma5)
## 
## Coefficients:
##                                                      Estimate Std. Error
## (Intercept)                                            3.0910     0.2132
## tumorsuperficial spreading melanoma                   -0.3185     0.3286
## tumornodular                                          -0.1466     0.3132
## tumorindeterminate                                    -0.6931     0.3693
## site2trunk                                            -2.3979     0.7385
## site2extremities                                      -0.7885     0.3814
## tumorsuperficial spreading melanoma:site2trunk         3.6143     0.7915
## tumornodular:site2trunk                                2.9500     0.7927
## tumorindeterminate:site2trunk                          2.8332     0.8338
## tumorsuperficial spreading melanoma:site2extremities   2.7608     0.4655
## tumornodular:site2extremities                          2.1345     0.4602
## tumorindeterminate:site2extremities                    1.7228     0.5216
##                                                      z value Pr(>|z|)    
## (Intercept)                                           14.498  < 2e-16 ***
## tumorsuperficial spreading melanoma                   -0.969 0.332432    
## tumornodular                                          -0.468 0.639712    
## tumorindeterminate                                    -1.877 0.060511 .  
## site2trunk                                            -3.247 0.001167 ** 
## site2extremities                                      -2.067 0.038701 *  
## tumorsuperficial spreading melanoma:site2trunk         4.566 4.96e-06 ***
## tumornodular:site2trunk                                3.721 0.000198 ***
## tumorindeterminate:site2trunk                          3.398 0.000679 ***
## tumorsuperficial spreading melanoma:site2extremities   5.931 3.00e-09 ***
## tumornodular:site2extremities                          4.638 3.52e-06 ***
## tumorindeterminate:site2extremities                    3.303 0.000957 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance:  2.9520e+02  on 11  degrees of freedom
## Residual deviance: -1.7542e-14  on  0  degrees of freedom
## AIC: 83.111
## 
## Number of Fisher Scoring iterations: 3
logLik(ressat.melanoma)
## 'log Lik.' -29.5557 (df=12)
ressat.melanoma.add<-glm(frequency~tumor+site2,family=poisson(),data=melanoma5)
summary(ressat.melanoma.add)
## 
## Call:
## glm(formula = frequency ~ tumor + site2, family = poisson(), 
##     data = melanoma5)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           1.7544     0.2040   8.600  < 2e-16 ***
## tumorsuperficial spreading melanoma   1.6940     0.1866   9.079  < 2e-16 ***
## tumornodular                          1.3020     0.1934   6.731 1.68e-11 ***
## tumorindeterminate                    0.4990     0.2174   2.295  0.02173 *  
## site2trunk                            0.4439     0.1554   2.857  0.00427 ** 
## site2extremities                      1.2010     0.1383   8.683  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 295.203  on 11  degrees of freedom
## Residual deviance:  51.795  on  6  degrees of freedom
## AIC: 122.91
## 
## Number of Fisher Scoring iterations: 5
#D, the deviance of the additive model relative to the saturated model
logLik(ressat.melanoma.add)
## 'log Lik.' -55.45321 (df=6)
2*(logLik(ressat.melanoma)-logLik(ressat.melanoma.add))
## 'log Lik.' 51.79501 (df=12)
ressat.melanoma.add$fitted.values
##       1       2       3       4       5       6       7       8       9      10 
##   5.780   9.010  19.210  31.450  49.025 104.525  21.250  33.125  70.625   9.520 
##      11      12 
##  14.840  31.640
melanoma5$frequency
##  [1]  22   2  10  16  54 115  19  33  73  11  17  28
#chi-square
sum((ressat.melanoma.add$fitted.values-melanoma5$frequency)^2/ressat.melanoma.add$fitted.values)
## [1] 65.81293
ressat.melanoma.mini<-glm(frequency~1,family=poisson(),data=melanoma5)
summary(ressat.melanoma.mini)
## 
## Call:
## glm(formula = frequency ~ 1, family = poisson(), data = melanoma5)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    3.507      0.050   70.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 295.2  on 11  degrees of freedom
## Residual deviance: 295.2  on 11  degrees of freedom
## AIC: 356.31
## 
## Number of Fisher Scoring iterations: 5
#Delta D, the deviance difference between the minimal and additive models
logLik(ressat.melanoma.mini)
## 'log Lik.' -177.1572 (df=1)
2*(logLik(ressat.melanoma.add)-logLik(ressat.melanoma.mini))
## 'log Lik.' 243.408 (df=6)
#pseudo R2
(logLik(ressat.melanoma.mini)-logLik(ressat.melanoma))/logLik(ressat.melanoma.mini)
## 'log Lik.' 0.8331668 (df=1)
#Table 9.7
ulcer
## # A tibble: 8 × 4
##   ulcer    `case-control` aspirin  frequency
##   <chr>    <chr>          <chr>        <dbl>
## 1 gastric  control        non-user        62
## 2 gastric  control        user             6
## 3 gastric  case           non-user        39
## 4 gastric  case           user            25
## 5 duodenal control        non-user        53
## 6 duodenal control        user             8
## 7 duodenal case           non-user        49
## 8 duodenal case           user             8
ulcer$ulcer
## [1] "gastric"  "gastric"  "gastric"  "gastric"  "duodenal" "duodenal" "duodenal"
## [8] "duodenal"
#GD ulcer site
#CC case-control status
#AP aspirin
library(tidyverse)
ulcer2 <- ulcer |> mutate(GD = factor(ulcer, levels = c("gastric", "duodenal")),
                          CC = factor(`case-control`, levels = c("control", "case")),
                          AP = factor(aspirin, levels = c("non-user", "user"))
                   )
ulcer2
## # A tibble: 8 × 7
##   ulcer    `case-control` aspirin  frequency GD       CC      AP      
##   <chr>    <chr>          <chr>        <dbl> <fct>    <fct>   <fct>   
## 1 gastric  control        non-user        62 gastric  control non-user
## 2 gastric  control        user             6 gastric  control user    
## 3 gastric  case           non-user        39 gastric  case    non-user
## 4 gastric  case           user            25 gastric  case    user    
## 5 duodenal control        non-user        53 duodenal control non-user
## 6 duodenal control        user             8 duodenal control user    
## 7 duodenal case           non-user        49 duodenal case    non-user
## 8 duodenal case           user             8 duodenal case    user
res1.aspirin<-glm(frequency~GD + CC + GD*CC, family=poisson(), data=ulcer2)
summary(res1.aspirin)
## 
## Call:
## glm(formula = frequency ~ GD + CC + GD * CC, family = poisson(), 
##     data = ulcer2)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        3.526361   0.121268  29.079   <2e-16 ***
## GDduodenal        -0.108634   0.176350  -0.616    0.538    
## CCcase            -0.060625   0.174158  -0.348    0.728    
## GDduodenal:CCcase -0.007198   0.253512  -0.028    0.977    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 127.75  on 7  degrees of freedom
## Residual deviance: 126.71  on 4  degrees of freedom
## AIC: 174.32
## 
## Number of Fisher Scoring iterations: 5
res2.aspirin<-glm(frequency~GD + CC + GD*CC + AP, family=poisson(), data=ulcer2)
summary(res2.aspirin)
## 
## Call:
## glm(formula = frequency ~ GD + CC + GD * CC + AP, family = poisson(), 
##     data = ulcer2)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        4.011253   0.125028  32.083   <2e-16 ***
## GDduodenal        -0.108634   0.176350  -0.616    0.538    
## CCcase            -0.060625   0.174158  -0.348    0.728    
## APuser            -1.463058   0.161872  -9.038   <2e-16 ***
## GDduodenal:CCcase -0.007198   0.253512  -0.028    0.977    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 127.749  on 7  degrees of freedom
## Residual deviance:  21.789  on 3  degrees of freedom
## AIC: 71.404
## 
## Number of Fisher Scoring iterations: 5
res3.aspirin<-glm(frequency~GD + CC + GD*CC + AP + AP*CC,family=poisson(), data=ulcer2)
summary(res3.aspirin)
## 
## Call:
## glm(formula = frequency ~ GD + CC + GD * CC + AP + AP * CC, family = poisson(), 
##     data = ulcer2)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        4.104627   0.125098  32.811  < 2e-16 ***
## GDduodenal        -0.108634   0.176350  -0.616  0.53789    
## CCcase            -0.264198   0.185402  -1.425  0.15416    
## APuser            -2.105875   0.283062  -7.440 1.01e-13 ***
## GDduodenal:CCcase -0.007198   0.253511  -0.028  0.97735    
## CCcase:APuser      1.125046   0.348984   3.224  0.00127 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 127.749  on 7  degrees of freedom
## Residual deviance:  10.538  on 2  degrees of freedom
## AIC: 62.153
## 
## Number of Fisher Scoring iterations: 4
res4.aspirin<-glm(frequency~GD + CC + GD*CC + AP + AP*CC + AP*GD, family=poisson(), data=ulcer2)
summary(res4.aspirin)
## 
## Call:
## glm(formula = frequency ~ GD + CC + GD * CC + AP + AP * CC + 
##     AP * GD, family = poisson(), data = ulcer2)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        4.06961    0.12862  31.640  < 2e-16 ***
## GDduodenal        -0.03598    0.18042  -0.199  0.84195    
## CCcase            -0.32091    0.19384  -1.656  0.09782 .  
## APuser            -1.82193    0.30796  -5.916  3.3e-09 ***
## GDduodenal:CCcase  0.10574    0.26147   0.404  0.68590    
## CCcase:APuser      1.14288    0.35207   3.246  0.00117 ** 
## GDduodenal:APuser -0.70005    0.34603  -2.023  0.04306 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 127.749  on 7  degrees of freedom
## Residual deviance:   6.283  on 1  degrees of freedom
## AIC: 59.898
## 
## Number of Fisher Scoring iterations: 4
#Table 9.11
tibble(Models=c("GD+CC+GDXCC",
                "GD+CC+GDXCC+AP",
                "GD+CC+GDXCC+AP+APXCC",
                "GD+CC+GDXCC+AP+APXCC++APXGD"),
       D.F.=c(res1.aspirin$df.residual,
              res2.aspirin$df.residual,
              res3.aspirin$df.residual,
              res4.aspirin$df.residual),
       Deviance=c(res1.aspirin$deviance,
                  res2.aspirin$deviance,
                  res3.aspirin$deviance,
                  res4.aspirin$deviance)
       ) |> as.data.frame()
##                       Models D.F.   Deviance
## 1                GD+CC+GDXCC    4 126.707890
## 2             GD+CC+GDXCC+AP    3  21.789257
## 3       GD+CC+GDXCC+AP+APXCC    2  10.538439
## 4 GD+CC+GDXCC+AP+APXCC+APXGD    1   6.282983
#delta D comparison of aspirin use between cases and controls
res2.aspirin$deviance - res3.aspirin$deviance
## [1] 11.25082
(1-pchisq(11.25082, df=1))
## [1] 0.0007958785
#delta D, difference between ulcer sites
res3.aspirin$deviance - res4.aspirin$deviance
## [1] 4.255456
(1-pchisq(4.255456, df=1))
## [1] 0.03912444
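Both drop-in-deviance tests can also be run in a single call with anova() on the nested fits; a sketch equivalent to the pchisq computations above (output not shown):

anova(res2.aspirin, res3.aspirin, test = "Chisq") #aspirin use, cases vs. controls
anova(res3.aspirin, res4.aspirin, test = "Chisq") #difference between ulcer sites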
#Pearson chi-squared goodness-of-fit statistic
sum((res4.aspirin$fitted.values-ulcer2$frequency)^2/res4.aspirin$fitted.values)
## [1] 6.48795
(1-pchisq(6.48795, df=1))
## [1] 0.01086082
res4.aspirin$fitted.values
##         1         2         3         4         5         6         7         8 
## 58.534241  9.465759 42.465759 21.534241 56.465759  4.534241 45.534241 11.465759
res4.aspirin$deviance
## [1] 6.282983

Exercises 9.1

For independent Poisson responses, the joint probability distribution is \[f(\mathbf y;\boldsymbol\mu)=\prod_{i=1}^{N}\mu_i^{y_i}e^{-\mu_i}/y_i!\]

With the log link \(\log\mu_i=\beta_1+\sum_{j=2}^{J}x_{ij}\beta_j\), the log-likelihood is \[l(\mathbf y;\boldsymbol\mu)=\sum_{i=1}^{N}\left(y_i\log(\mu_i)-\mu_i-\log(y_i!)\right)=\sum_{i=1}^{N}\left(y_i\left(\beta_1+\sum_{j=2}^{J}x_{ij}\beta_j\right)-\exp\left(\beta_1+\sum_{j=2}^{J}x_{ij}\beta_j\right)-\log(y_i!)\right)\] The score statistic for \(\beta_1\) is \[\frac{\partial l(\mathbf y;\boldsymbol\mu)}{\partial \beta_1}=\sum_{i=1}^{N}\left(y_i-\mu_i\right),\] which equals zero at the maximum likelihood estimate.
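So any fitted Poisson log-linear model with an intercept satisfies \(\sum_i\hat\mu_i=\sum_i y_i\); a quick numerical check against the res1.aspirin fit from above:

sum(ulcer2$frequency - fitted(res1.aspirin)) #sums to (numerically) zero at the MLE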

#Exercises 9.2
insurance
## # A tibble: 32 × 5
##      car   age district     y     n
##    <dbl> <dbl>    <dbl> <dbl> <dbl>
##  1     1     1        0    65   317
##  2     1     2        0    65   476
##  3     1     3        0    52   486
##  4     1     4        0   310  3259
##  5     2     1        0    98   486
##  6     2     2        0   159  1004
##  7     2     3        0   175  1355
##  8     2     4        0   877  7660
##  9     3     1        0    41   223
## 10     3     2        0   117   539
## 11     3     3        0   137   697
## 12     3     4        0   477  3442
## 13     4     1        0    11    40
## 14     4     2        0    35   148
## 15     4     3        0    39   214
## 16     4     4        0   167  1019
## 17     1     1        1     2    20
## 18     1     2        1     5    33
## 19     1     3        1     4    40
## 20     1     4        1    36   316
## 21     2     1        1     7    31
## 22     2     2        1    10    81
## 23     2     3        1    22   122
## 24     2     4        1   102   724
## 25     3     1        1     5    18
## # ℹ 7 more rows
insurance$car2 <- factor(insurance$car)
insurance$age2 <- factor(insurance$age)
insurance$district2 <- factor(insurance$district)
insurance |> group_by(c(car2)) |> mutate(rate=sum(y)/sum(n))
## # A tibble: 32 × 10
## # Groups:   c(car2) [4]
##      car   age district     y     n car2  age2  district2 `c(car2)`  rate
##    <dbl> <dbl>    <dbl> <dbl> <dbl> <fct> <fct> <fct>     <fct>     <dbl>
##  1     1     1        0    65   317 1     1     0         1         0.109
##  2     1     2        0    65   476 1     2     0         1         0.109
##  3     1     3        0    52   486 1     3     0         1         0.109
##  4     1     4        0   310  3259 1     4     0         1         0.109
##  5     2     1        0    98   486 2     1     0         2         0.126
##  6     2     2        0   159  1004 2     2     0         2         0.126
##  7     2     3        0   175  1355 2     3     0         2         0.126
##  8     2     4        0   877  7660 2     4     0         2         0.126
##  9     3     1        0    41   223 3     1     0         3         0.161
## 10     3     2        0   117   539 3     2     0         3         0.161
## 11     3     3        0   137   697 3     3     0         3         0.161
## 12     3     4        0   477  3442 3     4     0         3         0.161
## 13     4     1        0    11    40 4     1     0         4         0.189
## 14     4     2        0    35   148 4     2     0         4         0.189
## 15     4     3        0    39   214 4     3     0         4         0.189
## 16     4     4        0   167  1019 4     4     0         4         0.189
## 17     1     1        1     2    20 1     1     1         1         0.109
## 18     1     2        1     5    33 1     2     1         1         0.109
## 19     1     3        1     4    40 1     3     1         1         0.109
## 20     1     4        1    36   316 1     4     1         1         0.109
## 21     2     1        1     7    31 2     1     1         2         0.126
## 22     2     2        1    10    81 2     2     1         2         0.126
## 23     2     3        1    22   122 2     3     1         2         0.126
## 24     2     4        1   102   724 2     4     1         2         0.126
## 25     3     1        1     5    18 3     1     1         3         0.161
## # ℹ 7 more rows
insurance |> ggplot(aes(x=car2, y=y/n)) + 
  geom_point(size=3, aes(color=age2)) +
  facet_grid(~district2) +
  scale_colour_prism(palette = "candy_bright") +
  theme_prism(border = TRUE, 
              palette = "black_and_white", 
              base_size = 12) +
  labs(x="cars", y="rate")

res.insurance<-glm(y/n~car2+age2+district2+car2*age2+car2*district2+age2*district2, family=quasipoisson(), data=insurance)
summary(res.insurance)
## 
## Call:
## glm(formula = y/n ~ car2 + age2 + district2 + car2 * age2 + car2 * 
##     district2 + age2 * district2, family = quasipoisson(), data = insurance)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      -1.63176    0.39059  -4.178  0.00238 **
## car22             0.22917    0.50283   0.456  0.65936   
## car23             0.28810    0.49292   0.584  0.57325   
## car24            -0.18154    0.54274  -0.334  0.74568   
## age22            -0.25468    0.53999  -0.472  0.64840   
## age23            -0.67888    0.59357  -1.144  0.28225   
## age24            -0.69793    0.59799  -1.167  0.27315   
## district21       -0.58042    0.47743  -1.216  0.25502   
## car22:age22      -0.39160    0.69271  -0.565  0.58567   
## car23:age22      -0.13194    0.65924  -0.200  0.84582   
## car24:age22       0.83270    0.67478   1.234  0.24844   
## car22:age23       0.01957    0.72284   0.027  0.97899   
## car23:age23       0.26895    0.69355   0.388  0.70718   
## car24:age23       0.95684    0.72306   1.323  0.21836   
## car22:age24      -0.18616    0.74021  -0.251  0.80708   
## car23:age24      -0.03957    0.71448  -0.055  0.95705   
## car24:age24       0.84184    0.72819   1.156  0.27742   
## car22:district21  0.27647    0.52431   0.527  0.61074   
## car23:district21  0.31835    0.49950   0.637  0.53978   
## car24:district21  0.20342    0.48888   0.416  0.68709   
## age22:district21  0.47513    0.45559   1.043  0.32420   
## age23:district21  0.66356    0.46475   1.428  0.18712   
## age24:district21  0.71833    0.48559   1.479  0.17319   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 0.03758626)
## 
##     Null deviance: 1.10956  on 31  degrees of freedom
## Residual deviance: 0.43626  on  9  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
res.insurance2<-glm(y/n~car+age+district, family=quasipoisson(), data=insurance)
summary(res.insurance2)
## 
## Call:
## glm(formula = y/n ~ car + age + district, family = quasipoisson(), 
##     data = insurance)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.14870    0.20948 -10.257 5.48e-11 ***
## car          0.19999    0.05271   3.794 0.000727 ***
## age         -0.05696    0.05190  -1.098 0.281733    
## district     0.08896    0.11600   0.767 0.449551    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 0.01926316)
## 
##     Null deviance: 1.10956  on 31  degrees of freedom
## Residual deviance: 0.79295  on 28  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4
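Modelling y/n directly treats the observed rates as the response; a more conventional formulation keeps the claim counts as the response and moves the exposure into an offset. A sketch (res.insurance.offset is a new name introduced here; output not shown):

res.insurance.offset <- glm(y ~ car + age + district + offset(log(n)),
                            family = poisson(), data = insurance)
summary(res.insurance.offset)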
#Exercises 9.3
vaccine
## # A tibble: 6 × 3
##   treatment response frequency
##   <chr>     <chr>        <dbl>
## 1 placebo   small           25
## 2 placebo   moderate         8
## 3 placebo   large            5
## 4 vaccine   small            6
## 5 vaccine   moderate        18
## 6 vaccine   large           11
vaccine.frequency <- matrix(c(vaccine$frequency), ncol = 2, byrow = FALSE)
vaccine.frequency
##      [,1] [,2]
## [1,]   25    6
## [2,]    8   18
## [3,]    5   11
chisq.test(vaccine.frequency)
## 
## 	Pearson's Chi-squared test
## 
## data:  vaccine.frequency
## X-squared = 17.648, df = 2, p-value = 0.0001472
res.vaccine<-glm(frequency~treatment+response,family=poisson(),data=vaccine)
summary(res.vaccine)
## 
## Call:
## glm(formula = frequency ~ treatment + response, family = poisson(), 
##     data = vaccine)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       2.11972    0.27408   7.734 1.04e-14 ***
## treatmentvaccine -0.08224    0.23428  -0.351   0.7256    
## responsemoderate  0.48551    0.31774   1.528   0.1265    
## responsesmall     0.66140    0.30783   2.149   0.0317 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 23.807  on 5  degrees of freedom
## Residual deviance: 18.643  on 2  degrees of freedom
## AIC: 51.771
## 
## Number of Fisher Scoring iterations: 5
res.vaccine$fitted.values
##         1         2         3         4         5         6 
## 16.136986 13.534247  8.328767 14.863014 12.465753  7.671233
vaccine$frequency
## [1] 25  8  5  6 18 11
sum((vaccine$frequency-res.vaccine$fitted.values)^2/res.vaccine$fitted.values)
## [1] 17.64783
chisq.stat <- sum((vaccine$frequency-res.vaccine$fitted.values)^2/res.vaccine$fitted.values)
1-pchisq(chisq.stat, df=res.vaccine$df.residual)
## [1] 0.0001471709
res.vaccine$deviance
## [1] 18.64253
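The residual deviance gives an analogous goodness-of-fit test on the same degrees of freedom; a sketch mirroring the Pearson version above (output not shown):

1 - pchisq(res.vaccine$deviance, df = res.vaccine$df.residual)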
res.vaccine$residuals
##          1          2          3          4          5          6 
##  0.5492360 -0.4089069 -0.3996711 -0.5963134  0.4439560  0.4339286
#For the placebo group the estimated probabilities for the ‘small’, ‘moderate’ and ‘large’ responses are
res.vaccine$coefficients
##      (Intercept) treatmentvaccine responsemoderate    responsesmall 
##        2.1197154       -0.0822381        0.4855078        0.6613985
small <- exp(res.vaccine$coefficients[4])
moderate <- exp(res.vaccine$coefficients[3])
total <- sum(small, moderate, 1)
#small
small/total
## responsesmall 
##     0.4246575
#moderate
moderate/total
## responsemoderate 
##        0.3561644
#large
1/total
## [1] 0.2191781
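These probabilities come from the independence model, so the same values apply to the vaccine group; for comparison, the observed proportions within each treatment arm can be read off directly (column 1 is placebo, column 2 vaccine):

#observed response proportions within each treatment column
prop.table(vaccine.frequency, margin = 2)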

Chapter10 Survival Analysis

#Table10.1
remission
## # A tibble: 42 × 3
##     time group censored
##    <dbl> <chr>    <dbl>
##  1     1 C            0
##  2     1 C            0
##  3     2 C            0
##  4     2 C            0
##  5     3 C            0
##  6     4 C            0
##  7     4 C            0
##  8     5 C            0
##  9     5 C            0
## 10     8 C            0
## 11     8 C            0
## 12     8 C            0
## 13     8 C            0
## 14    11 C            0
## 15    11 C            0
## 16    12 C            0
## 17    12 C            0
## 18    15 C            0
## 19    17 C            0
## 20    22 C            0
## 21    23 C            0
## 22     6 T            0
## 23     6 T            0
## 24     6 T            0
## 25     6 T            1
## # ℹ 17 more rows
remission |> ggplot(aes(x=time)) +
  geom_bar(stat = "count") + 
  facet_wrap(vars(group)) + theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
)

#Table10.2
library(tidyverse)
data.frame(
  n=c(21,21,17,15,12,11,7,6),
  d=c(0,3,1,1,1,1,1,1)
) |> mutate(s=(n-d)/n,
            S=cumprod(s))
##    n d         s         S
## 1 21 0 1.0000000 1.0000000
## 2 21 3 0.8571429 0.8571429
## 3 17 1 0.9411765 0.8067227
## 4 15 1 0.9333333 0.7529412
## 5 12 1 0.9166667 0.6901961
## 6 11 1 0.9090909 0.6274510
## 7  7 1 0.8571429 0.5378151
## 8  6 1 0.8333333 0.4481793
survival_treatment <- data.frame(
  n=c(21,21,17,15,12,11,7,6),
  d=c(0,3,1,1,1,1,1,1)
) |> mutate(time = c(0,6,7,10,13,16,22,23),
            treatment = rep("treatment", 8),
            s=(n-d)/n,
            S=cumprod(s))
survival_treatment
##    n d time treatment         s         S
## 1 21 0    0 treatment 1.0000000 1.0000000
## 2 21 3    6 treatment 0.8571429 0.8571429
## 3 17 1    7 treatment 0.9411765 0.8067227
## 4 15 1   10 treatment 0.9333333 0.7529412
## 5 12 1   13 treatment 0.9166667 0.6901961
## 6 11 1   16 treatment 0.9090909 0.6274510
## 7  7 1   22 treatment 0.8571429 0.5378151
## 8  6 1   23 treatment 0.8333333 0.4481793
#
survival_control <- data.frame(
  n=c(21,21,19,17,16,14,12,8,6,4,3,2,1),
  d=c(0,2,2,1,2,2,4,2,2,1,1,1,1)
) |> mutate(time = c(0,1,2,3,4,5,8,11,12,15,17,22,23),
            treatment = rep("control", 13),
            s=(n-d)/n,
            S=cumprod(s))
survival_control
##     n d time treatment         s          S
## 1  21 0    0   control 1.0000000 1.00000000
## 2  21 2    1   control 0.9047619 0.90476190
## 3  19 2    2   control 0.8947368 0.80952381
## 4  17 1    3   control 0.9411765 0.76190476
## 5  16 2    4   control 0.8750000 0.66666667
## 6  14 2    5   control 0.8571429 0.57142857
## 7  12 4    8   control 0.6666667 0.38095238
## 8   8 2   11   control 0.7500000 0.28571429
## 9   6 2   12   control 0.6666667 0.19047619
## 10  4 1   15   control 0.7500000 0.14285714
## 11  3 1   17   control 0.6666667 0.09523810
## 12  2 1   22   control 0.5000000 0.04761905
## 13  1 1   23   control 0.0000000 0.00000000
data <- rbind(survival_control, survival_treatment)
data
##     n d time treatment         s          S
## 1  21 0    0   control 1.0000000 1.00000000
## 2  21 2    1   control 0.9047619 0.90476190
## 3  19 2    2   control 0.8947368 0.80952381
## 4  17 1    3   control 0.9411765 0.76190476
## 5  16 2    4   control 0.8750000 0.66666667
## 6  14 2    5   control 0.8571429 0.57142857
## 7  12 4    8   control 0.6666667 0.38095238
## 8   8 2   11   control 0.7500000 0.28571429
## 9   6 2   12   control 0.6666667 0.19047619
## 10  4 1   15   control 0.7500000 0.14285714
## 11  3 1   17   control 0.6666667 0.09523810
## 12  2 1   22   control 0.5000000 0.04761905
## 13  1 1   23   control 0.0000000 0.00000000
## 14 21 0    0 treatment 1.0000000 1.00000000
## 15 21 3    6 treatment 0.8571429 0.85714286
## 16 17 1    7 treatment 0.9411765 0.80672269
## 17 15 1   10 treatment 0.9333333 0.75294118
## 18 12 1   13 treatment 0.9166667 0.69019608
## 19 11 1   16 treatment 0.9090909 0.62745098
## 20  7 1   22 treatment 0.8571429 0.53781513
## 21  6 1   23 treatment 0.8333333 0.44817927
data |> ggplot(aes(x=time, y=S)) + geom_step(aes(linetype = treatment)) + theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
) + theme_prism(border = TRUE, 
              palette = "black_and_white", 
              base_size = 12) 
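The same Kaplan-Meier estimates can be reproduced with survival::survfit (the survival package is loaded again further below; km.remission is a new name introduced here, output not shown):

library(survival)
km.remission <- survfit(Surv(time, censored == 0) ~ group, data = remission)
summary(km.remission) #event times, numbers at risk and the survivor function by group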

#cumulative hazard function
data |> mutate(logH=log(-log(S)),
               logY=log(time))
##     n d time treatment         s          S        logH      logY
## 1  21 0    0   control 1.0000000 1.00000000        -Inf      -Inf
## 2  21 2    1   control 0.9047619 0.90476190 -2.30175086 0.0000000
## 3  19 2    2   control 0.8947368 0.80952381 -1.55443332 0.6931472
## 4  17 1    3   control 0.9411765 0.76190476 -1.30219694 1.0986123
## 5  16 2    4   control 0.8750000 0.66666667 -0.90272046 1.3862944
## 6  14 2    5   control 0.8571429 0.57142857 -0.58050482 1.6094379
## 7  12 4    8   control 0.6666667 0.38095238 -0.03554335 2.0794415
## 8   8 2   11   control 0.7500000 0.28571429  0.22535149 2.3978953
## 9   6 2   12   control 0.6666667 0.19047619  0.50574961 2.4849066
## 10  4 1   15   control 0.7500000 0.14285714  0.66572981 2.7080502
## 11  3 1   17   control 0.6666667 0.09523810  0.85500037 2.8332133
## 12  2 1   22   control 0.5000000 0.04761905  1.11334405 3.0910425
## 13  1 1   23   control 0.0000000 0.00000000         Inf 3.1354942
## 14 21 0    0 treatment 1.0000000 1.00000000        -Inf      -Inf
## 15 21 3    6 treatment 0.8571429 0.85714286 -1.86982471 1.7917595
## 16 17 1    7 treatment 0.9411765 0.80672269 -1.53816291 1.9459101
## 17 15 1   10 treatment 0.9333333 0.75294118 -1.25959767 2.3025851
## 18 12 1   13 treatment 0.9166667 0.69019608 -0.99214760 2.5649494
## 19 11 1   16 treatment 0.9090909 0.62745098 -0.76337711 2.7725887
## 20  7 1   22 treatment 0.8571429 0.53781513 -0.47764812 3.0910425
## 21  6 1   23 treatment 0.8333333 0.44817927 -0.21994621 3.1354942
data |> mutate(logH=log(-log(S)),
               logY=log(time)) |> ggplot(aes(x=logY, y=logH)) + geom_point(aes(shape = treatment), size=3) + 
  theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
)

remission
## # A tibble: 42 × 3
##     time group censored
##    <dbl> <chr>    <dbl>
##  1     1 C            0
##  2     1 C            0
##  3     2 C            0
##  4     2 C            0
##  5     3 C            0
##  6     4 C            0
##  7     4 C            0
##  8     5 C            0
##  9     5 C            0
## 10     8 C            0
## 11     8 C            0
## 12     8 C            0
## 13     8 C            0
## 14    11 C            0
## 15    11 C            0
## 16    12 C            0
## 17    12 C            0
## 18    15 C            0
## 19    17 C            0
## 20    22 C            0
## 21    23 C            0
## 22     6 T            0
## 23     6 T            0
## 24     6 T            0
## 25     6 T            1
## # ℹ 17 more rows
#the exponential survival model can be fitted as a Poisson GLM, with the event indicator (censored == 0) as response and log(time) as an offset:
res.gehanexp<-glm(censored==0~group + offset(log(time)), family=poisson(), 
                  data=remission)
summary(res.gehanexp)
## 
## Call:
## glm(formula = censored == 0 ~ group + offset(log(time)), family = poisson(), 
##     data = remission)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.1595     0.2182  -9.896  < 2e-16 ***
## groupT       -1.5266     0.3984  -3.832 0.000127 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 54.503  on 41  degrees of freedom
## Residual deviance: 38.017  on 40  degrees of freedom
## AIC: 102.02
## 
## Number of Fisher Scoring iterations: 6
#AIC
AIC(res.gehanexp)
## [1] 102.0173
-2* logLik(res.gehanexp)+2*2
## 'log Lik.' 102.0173 (df=2)
log(remission$time)
##  [1] 0.0000000 0.0000000 0.6931472 0.6931472 1.0986123 1.3862944 1.3862944
##  [8] 1.6094379 1.6094379 2.0794415 2.0794415 2.0794415 2.0794415 2.3978953
## [15] 2.3978953 2.4849066 2.4849066 2.7080502 2.8332133 3.0910425 3.1354942
## [22] 1.7917595 1.7917595 1.7917595 1.7917595 1.9459101 2.1972246 2.3025851
## [29] 2.3025851 2.3978953 2.5649494 2.7725887 2.8332133 2.9444390 2.9957323
## [36] 3.0910425 3.1354942 3.2188758 3.4657359 3.4657359 3.5263605 3.5553481
offset(log(remission$time)) #offset() returns its argument unchanged; it only marks the term in a model formula
##  [1] 0.0000000 0.0000000 0.6931472 0.6931472 1.0986123 1.3862944 1.3862944
##  [8] 1.6094379 1.6094379 2.0794415 2.0794415 2.0794415 2.0794415 2.3978953
## [15] 2.3978953 2.4849066 2.4849066 2.7080502 2.8332133 3.0910425 3.1354942
## [22] 1.7917595 1.7917595 1.7917595 1.7917595 1.9459101 2.1972246 2.3025851
## [29] 2.3025851 2.3978953 2.5649494 2.7725887 2.8332133 2.9444390 2.9957323
## [36] 3.0910425 3.1354942 3.2188758 3.4657359 3.4657359 3.5263605 3.5553481
mu <- res.gehanexp$fitted.values
y <- remission$time
m <- sum(remission$censored == 0)
delta <- 1 - remission$censored #event indicator: 1 = event, 0 = censored
#lambda_hat
m/sum((mu-delta)*log(y), na.rm = TRUE)
## [1] 1.631587
library(survival)
Surv(remission$time,remission$censored==0)
##  [1]  1   1   2   2   3   4   4   5   5   8   8   8   8  11  11  12  12  15  17 
## [20] 22  23   6   6   6   6+  7   9+ 10  10+ 11+ 13  16  17+ 19+ 20+ 22  23  25+
## [39] 32+ 32+ 34+ 35+
res.gehan<-survreg(Surv(time,censored==0)~group, dist="exponential", data=remission)
summary(res.gehan)
## 
## Call:
## survreg(formula = Surv(time, censored == 0) ~ group, data = remission, 
##     dist = "exponential")
##             Value Std. Error    z       p
## (Intercept) 2.159      0.218 9.90 < 2e-16
## groupT      1.527      0.398 3.83 0.00013
## 
## Scale fixed at 1 
## 
## Exponential distribution
## Loglik(model)= -108.5   Loglik(intercept only)= -116.8
## 	Chisq= 16.49 on 1 degrees of freedom, p= 4.9e-05 
## Number of Newton-Raphson Iterations: 4 
## n= 42
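Note that survreg() fits an accelerated failure time model for log(time), so with the exponential distribution its coefficients are the negatives of the Poisson log-hazard coefficients above; a quick check (output not shown):

coef(res.gehan)
-coef(res.gehanexp)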
#AIC
AIC(res.gehan)
## [1] 221.0481
-2* logLik(res.gehan)+2*2
## 'log Lik.' 221.0481 (df=2)
res.gehan.weibull <-survreg(Surv(time,censored==0)~group,dist="weibull", data=remission)
summary(res.gehan.weibull)
## 
## Call:
## survreg(formula = Surv(time, censored == 0) ~ group, data = remission, 
##     dist = "weibull")
##              Value Std. Error     z       p
## (Intercept)  2.248      0.166 13.55 < 2e-16
## groupT       1.267      0.311  4.08 4.5e-05
## Log(scale)  -0.312      0.147 -2.12   0.034
## 
## Scale= 0.732 
## 
## Weibull distribution
## Loglik(model)= -106.6   Loglik(intercept only)= -116.4
## 	Chisq= 19.65 on 1 degrees of freedom, p= 9.3e-06 
## Number of Newton-Raphson Iterations: 5 
## n= 42
res.gehan.weibull$scale
## [1] 0.7321944
AIC(res.gehan.weibull)
## [1] 219.159
#leukemia_survival <- read.csv("leukemia_survival.csv", sep=" ", header = FALSE)
#colnames(leukemia_survival) <- c("Survival.time","White.blood.cell","Survival.time","White.blood.cell")
#leukemia_survival2 <- rbind(leukemia_survival[,1:2], leukemia_survival[,3:4])
#leukemia_survival2$AG <- c(rep("positive", 17), rep("negative", 17))
#leukemia_survival2
#Table10.4
leukemia_survival <- structure(list(Survival.time = c(65L, 156L, 100L, 134L, 16L, 
108L, 121L, 4L, 39L, 143L, 56L, 26L, 22L, 1L, 1L, 5L, 65L, 56L, 
65L, 17L, 7L, 16L, 22L, 3L, 4L, 2L, 3L, 8L, 4L, 3L, 30L, 4L, 
43L, NA), White.blood.cell = c(2.3, 0.75, 4.3, 2.6, 6, 10.5, 
10, 17, 5.4, 7, 9.4, 32, 35, 100, 100, 52, 100, 4.4, 3, 4, 1.5, 
9, 5.3, 10, 19, 27, 28, 31, 26, 21, 79, 100, 100, NA), AG = c("positive", 
"positive", "positive", "positive", "positive", "positive", "positive", 
"positive", "positive", "positive", "positive", "positive", "positive", 
"positive", "positive", "positive", "positive", "negative", "negative", 
"negative", "negative", "negative", "negative", "negative", "negative", 
"negative", "negative", "negative", "negative", "negative", "negative", 
"negative", "negative", "negative")), row.names = c(NA, -34L), class = "data.frame")
leukemia_survival
##    Survival.time White.blood.cell       AG
## 1             65             2.30 positive
## 2            156             0.75 positive
## 3            100             4.30 positive
## 4            134             2.60 positive
## 5             16             6.00 positive
## 6            108            10.50 positive
## 7            121            10.00 positive
## 8              4            17.00 positive
## 9             39             5.40 positive
## 10           143             7.00 positive
## 11            56             9.40 positive
## 12            26            32.00 positive
## 13            22            35.00 positive
## 14             1           100.00 positive
## 15             1           100.00 positive
## 16             5            52.00 positive
## 17            65           100.00 positive
## 18            56             4.40 negative
## 19            65             3.00 negative
## 20            17             4.00 negative
## 21             7             1.50 negative
## 22            16             9.00 negative
## 23            22             5.30 negative
## 24             3            10.00 negative
## 25             4            19.00 negative
## 26             2            27.00 negative
## 27             3            28.00 negative
## 28             8            31.00 negative
## 29             4            26.00 negative
## 30             3            21.00 negative
## 31            30            79.00 negative
## 32             4           100.00 negative
## 33            43           100.00 negative
## 34            NA               NA negative
#Exercises 10.1
survival_AG_positive <- data.frame(
  n=c(17,17,15,14,13,12,11,10,9,8,6,5,4,3,2,1),
  d=c(0,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1) #"0to1", "1to4", "4to5", "5to16", "16to22", "22to26","26to39","39to56", "56to65","65to100","100to108","108to121","121to134","134to143","143to156","156to..."
) |> mutate(time = c(0, unique(sort(leukemia_survival[leukemia_survival$AG == "positive",]$Survival.time))),
            AG = rep("positive", 16),
            s=(n-d)/n,
            S=cumprod(s))
survival_AG_positive
##     n d time       AG         s          S
## 1  17 0    0 positive 1.0000000 1.00000000
## 2  17 2    1 positive 0.8823529 0.88235294
## 3  15 1    4 positive 0.9333333 0.82352941
## 4  14 1    5 positive 0.9285714 0.76470588
## 5  13 1   16 positive 0.9230769 0.70588235
## 6  12 1   22 positive 0.9166667 0.64705882
## 7  11 1   26 positive 0.9090909 0.58823529
## 8  10 1   39 positive 0.9000000 0.52941176
## 9   9 1   56 positive 0.8888889 0.47058824
## 10  8 2   65 positive 0.7500000 0.35294118
## 11  6 1  100 positive 0.8333333 0.29411765
## 12  5 1  108 positive 0.8000000 0.23529412
## 13  4 1  121 positive 0.7500000 0.17647059
## 14  3 1  134 positive 0.6666667 0.11764706
## 15  2 1  143 positive 0.5000000 0.05882353
## 16  1 1  156 positive 0.0000000 0.00000000
survival_AG_negative <- data.frame(
  n=c(16,16,15,12,9,8,7,6,5,4,3,2,1),
  d=c(0,1,3,3,1,1,1,1,1,1,1,1,1) #"0to2", 2to3", "3to4", "4to7", "7to8", "8to16", "16to17","17to22","22to30", "30to43","43to56","56to65","65to..."
) |> mutate(time = c(0, unique(sort(leukemia_survival[leukemia_survival$AG == "negative",]$Survival.time))),
            AG = rep("negative", 13),
            s=(n-d)/n,
            S=cumprod(s))
survival_AG_negative
##     n d time       AG         s      S
## 1  16 0    0 negative 1.0000000 1.0000
## 2  16 1    2 negative 0.9375000 0.9375
## 3  15 3    3 negative 0.8000000 0.7500
## 4  12 3    4 negative 0.7500000 0.5625
## 5   9 1    7 negative 0.8888889 0.5000
## 6   8 1    8 negative 0.8750000 0.4375
## 7   7 1   16 negative 0.8571429 0.3750
## 8   6 1   17 negative 0.8333333 0.3125
## 9   5 1   22 negative 0.8000000 0.2500
## 10  4 1   30 negative 0.7500000 0.1875
## 11  3 1   43 negative 0.6666667 0.1250
## 12  2 1   56 negative 0.5000000 0.0625
## 13  1 1   65 negative 0.0000000 0.0000
survival_AG <- rbind(survival_AG_positive, survival_AG_negative)
survival_AG
##     n d time       AG         s          S
## 1  17 0    0 positive 1.0000000 1.00000000
## 2  17 2    1 positive 0.8823529 0.88235294
## 3  15 1    4 positive 0.9333333 0.82352941
## 4  14 1    5 positive 0.9285714 0.76470588
## 5  13 1   16 positive 0.9230769 0.70588235
## 6  12 1   22 positive 0.9166667 0.64705882
## 7  11 1   26 positive 0.9090909 0.58823529
## 8  10 1   39 positive 0.9000000 0.52941176
## 9   9 1   56 positive 0.8888889 0.47058824
## 10  8 2   65 positive 0.7500000 0.35294118
## 11  6 1  100 positive 0.8333333 0.29411765
## 12  5 1  108 positive 0.8000000 0.23529412
## 13  4 1  121 positive 0.7500000 0.17647059
## 14  3 1  134 positive 0.6666667 0.11764706
## 15  2 1  143 positive 0.5000000 0.05882353
## 16  1 1  156 positive 0.0000000 0.00000000
## 17 16 0    0 negative 1.0000000 1.00000000
## 18 16 1    2 negative 0.9375000 0.93750000
## 19 15 3    3 negative 0.8000000 0.75000000
## 20 12 3    4 negative 0.7500000 0.56250000
## 21  9 1    7 negative 0.8888889 0.50000000
## 22  8 1    8 negative 0.8750000 0.43750000
## 23  7 1   16 negative 0.8571429 0.37500000
## 24  6 1   17 negative 0.8333333 0.31250000
## 25  5 1   22 negative 0.8000000 0.25000000
## 26  4 1   30 negative 0.7500000 0.18750000
## 27  3 1   43 negative 0.6666667 0.12500000
## 28  2 1   56 negative 0.5000000 0.06250000
## 29  1 1   65 negative 0.0000000 0.00000000
survival_AG |> ggplot(aes(x=time, y=S)) + geom_step(aes(linetype = AG)) + theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
) + theme_prism(border = TRUE, 
              palette = "black_and_white", 
              base_size = 12) 

#cumulative hazard function
survival_AG |> mutate(logH=log(-log(S)),
               logY=log(time))
##     n d time       AG         s          S        logH      logY
## 1  17 0    0 positive 1.0000000 1.00000000        -Inf      -Inf
## 2  17 2    1 positive 0.8823529 0.88235294 -2.07813725 0.0000000
## 3  15 1    4 positive 0.9333333 0.82352941 -1.63909324 1.3862944
## 4  14 1    5 positive 0.9285714 0.76470588 -1.31578376 1.6094379
## 5  13 1   16 positive 0.9230769 0.70588235 -1.05467188 2.7725887
## 6  12 1   22 positive 0.9166667 0.64705882 -0.83167832 3.0910425
## 7  11 1   26 positive 0.9090909 0.58823529 -0.63369360 3.2580965
## 8  10 1   39 positive 0.9000000 0.52941176 -0.45257438 3.6635616
## 9   9 1   56 positive 0.8888889 0.47058824 -0.28266561 4.0253517
## 10  8 2   65 positive 0.7500000 0.35294118  0.04061769 4.1743873
## 11  6 1  100 positive 0.8333333 0.29411765  0.20194070 4.6051702
## 12  5 1  108 positive 0.8000000 0.23529412  0.36943646 4.6821312
## 13  4 1  121 positive 0.7500000 0.17647059  0.55077745 4.7957905
## 14  3 1  134 positive 0.6666667 0.11764706  0.76083675 4.8978398
## 15  2 1  143 positive 0.5000000 0.05882353  1.04141152 4.9628446
## 16  1 1  156 positive 0.0000000 0.00000000         Inf 5.0498560
## 17 16 0    0 negative 1.0000000 1.00000000        -Inf      -Inf
## 18 16 1    2 negative 0.9375000 0.93750000 -2.74049301 0.6931472
## 19 15 3    3 negative 0.8000000 0.75000000 -1.24589932 1.0986123
## 20 12 3    4 negative 0.7500000 0.56250000 -0.55275214 1.3862944
## 21  9 1    7 negative 0.8888889 0.50000000 -0.36651292 1.9459101
## 22  8 1    8 negative 0.8750000 0.43750000 -0.19033933 2.0794415
## 23  7 1   16 negative 0.8571429 0.37500000 -0.01935689 2.7725887
## 24  6 1   17 negative 0.8333333 0.31250000  0.15113254 2.8332133
## 25  5 1   22 negative 0.8000000 0.25000000  0.32663426 3.0910425
## 26  4 1   30 negative 0.7500000 0.18750000  0.51520189 3.4011974
## 27  3 1   43 negative 0.6666667 0.12500000  0.73209937 3.7612001
## 28  2 1   56 negative 0.5000000 0.06250000  1.01978144 4.0253517
## 29  1 1   65 negative 0.0000000 0.00000000         Inf 4.1743873
survival_AG |> mutate(logH=log(-log(S)),
               logY=log(time)) |> ggplot(aes(x=logY, y=logH)) + geom_point(aes(shape = AG), size=3) + 
  theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
)
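As a cross-check, survival::survfit reproduces these Kaplan-Meier estimates; all survival times in this data set are uncensored, so Surv() needs no event indicator. A sketch (output not shown):

summary(survfit(Surv(Survival.time) ~ AG, data = leukemia_survival))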

#the exponential model can be fitted the same way; every time here is an event, so the response is a vector of ones:
res.leukemia_survival.poisson<-glm(rep(1, length(AG))~AG + offset(log(Survival.time)), family=poisson(), 
                  data=leukemia_survival)
summary(res.leukemia_survival.poisson)
## 
## Call:
## glm(formula = rep(1, length(AG)) ~ AG + offset(log(Survival.time)), 
##     family = poisson(), data = leukemia_survival)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.8869     0.2500 -11.548  < 2e-16 ***
## AGpositive   -1.2478     0.3483  -3.582  0.00034 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 58.138  on 32  degrees of freedom
## Residual deviance: 46.198  on 31  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 116.2
## 
## Number of Fisher Scoring iterations: 6
#AIC
AIC(res.leukemia_survival.poisson)
## [1] 116.1983
-2* logLik(res.leukemia_survival.poisson)+2*2
## 'log Lik.' 116.1983 (df=2)
res.leukemia_survival.weibull <-survreg(Surv(Survival.time)~AG,dist="weibull", data=leukemia_survival)
summary(res.leukemia_survival.weibull)
## 
## Call:
## survreg(formula = Surv(Survival.time) ~ AG, data = leukemia_survival, 
##     dist = "weibull")
##             Value Std. Error    z      p
## (Intercept) 2.836      0.285 9.95 <2e-16
## AGpositive  1.258      0.385 3.27 0.0011
## Log(scale)  0.100      0.141 0.71 0.4767
## 
## Scale= 1.11 
## 
## Weibull distribution
## Loglik(model)= -149.2   Loglik(intercept only)= -153.6
## 	Chisq= 8.74 on 1 degrees of freedom, p= 0.0031 
## Number of Newton-Raphson Iterations: 6 
## n=33 (1 observation deleted due to missingness)
mu <- res.leukemia_survival.poisson$fitted.values
y <- leukemia_survival$Survival.time[1:33] #drop the trailing all-NA row to match the 33 fitted values
m <- length(mu)
#lambda_hat
m/sum((mu-1)*log(y), na.rm = TRUE)
## [1] 0.8594624
res.leukemia_survival.weibull2 <-survreg(Surv(Survival.time)~AG+log(White.blood.cell),dist="weibull", data=leukemia_survival)
summary(res.leukemia_survival.weibull2)
## 
## Call:
## survreg(formula = Surv(Survival.time) ~ AG + log(White.blood.cell), 
##     data = leukemia_survival, dist = "weibull")
##                         Value Std. Error     z       p
## (Intercept)            3.7087     0.4727  7.85 4.3e-15
## AGpositive             1.0206     0.3781  2.70  0.0069
## log(White.blood.cell) -0.3103     0.1313 -2.36  0.0181
## Log(scale)             0.0399     0.1392  0.29  0.7745
## 
## Scale= 1.04 
## 
## Weibull distribution
## Loglik(model)= -146.5   Loglik(intercept only)= -153.6
## 	Chisq= 14.18 on 2 degrees of freedom, p= 0.00084 
## Number of Newton-Raphson Iterations: 6 
## n=33 (1 observation deleted due to missingness)
#each unit increase in log(White.blood.cell) multiplies the expected survival time by
exp(-0.3103)
## [1] 0.733227

Chapter11 Clustered and Longitudinal Data

stroke.wide
## # A tibble: 24 × 10
##    Subject Group week1 week2 week3 week4 week5 week6 week7 week8
##      <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1       1 A        45    45    45    45    80    80    80    90
##  2       2 A        20    25    25    25    30    35    30    50
##  3       3 A        50    50    55    70    70    75    90    90
##  4       4 A        25    25    35    40    60    60    70    80
##  5       5 A       100   100   100   100   100   100   100   100
##  6       6 A        20    20    30    50    50    60    85    95
##  7       7 A        30    35    35    40    50    60    75    85
##  8       8 A        30    35    45    50    55    65    65    70
##  9       9 B        40    55    60    70    80    85    90    90
## 10      10 B        65    65    70    70    80    80    80    80
## 11      11 B        30    30    40    45    65    85    85    85
## 12      12 B        25    35    35    35    40    45    45    45
## 13      13 B        45    45    80    80    80    80    80    80
## 14      14 B        15    15    10    10    10    20    20    20
## 15      15 B        35    35    35    45    45    45    50    50
## 16      16 B        40    40    40    55    55    55    60    65
## 17      17 C        20    20    30    30    30    30    30    30
## 18      18 C        35    35    35    40    40    40    40    40
## 19      19 C        35    35    35    40    40    40    45    45
## 20      20 C        45    65    65    65    80    85    95   100
## 21      21 C        45    65    70    90    90    95    95   100
## 22      22 C        25    30    30    35    40    40    40    40
## 23      23 C        25    25    30    30    30    30    35    40
## 24      24 C        15    35    35    35    40    50    65    65
library(tidyverse)
stroke.wide |> pivot_longer(
    cols = starts_with("week"), 
    names_to = "week", 
    values_to = "score",
    values_drop_na = TRUE
  ) |> 
  mutate(
    week = readr::parse_number(week)
  )
## # A tibble: 192 × 4
##    Subject Group  week score
##      <dbl> <chr> <dbl> <dbl>
##  1       1 A         1    45
##  2       1 A         2    45
##  3       1 A         3    45
##  4       1 A         4    45
##  5       1 A         5    80
##  6       1 A         6    80
##  7       1 A         7    80
##  8       1 A         8    90
##  9       2 A         1    20
## 10       2 A         2    25
## 11       2 A         3    25
## 12       2 A         4    25
## 13       2 A         5    30
## 14       2 A         6    35
## 15       2 A         7    30
## 16       2 A         8    50
## 17       3 A         1    50
## 18       3 A         2    50
## 19       3 A         3    55
## 20       3 A         4    70
## 21       3 A         5    70
## 22       3 A         6    75
## 23       3 A         7    90
## 24       3 A         8    90
## 25       4 A         1    25
## # ℹ 167 more rows
stroke.long <- stroke.wide |> pivot_longer(
    cols = starts_with("week"), 
    names_to = "week", 
    values_to = "score",
    values_drop_na = TRUE
  ) |> 
  mutate(
    week = readr::parse_number(week)
  )
stroke.long
## # A tibble: 192 × 4
##    Subject Group  week score
##      <dbl> <chr> <dbl> <dbl>
##  1       1 A         1    45
##  2       1 A         2    45
##  3       1 A         3    45
##  4       1 A         4    45
##  5       1 A         5    80
##  6       1 A         6    80
##  7       1 A         7    80
##  8       1 A         8    90
##  9       2 A         1    20
## 10       2 A         2    25
## 11       2 A         3    25
## 12       2 A         4    25
## 13       2 A         5    30
## 14       2 A         6    35
## 15       2 A         7    30
## 16       2 A         8    50
## 17       3 A         1    50
## 18       3 A         2    50
## 19       3 A         3    55
## 20       3 A         4    70
## 21       3 A         5    70
## 22       3 A         6    75
## 23       3 A         7    90
## 24       3 A         8    90
## 25       4 A         1    25
## # ℹ 167 more rows
stroke.long |> ggplot(aes(x=week, y=score, group=Subject)) +
  geom_line(aes(color = Group, linetype = Group), size=1) + theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
)

stroke.long |> 
  group_by(Group,week) |> 
  summarize(average=mean(score, na.rm = TRUE),
            .groups = "drop") |> 
  ggplot(aes(x=week, y=average)) +
  geom_line(aes(color = Group, linetype = Group), size=1) + theme_bw() + theme(
  # Hide panel borders and remove grid lines
  #panel.border = element_line(colour = "black"),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  # Change axis line
  #axis.line = element_line(colour = "black")
)

stroke.wide
## # A tibble: 24 × 10
##    Subject Group week1 week2 week3 week4 week5 week6 week7 week8
##      <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1       1 A        45    45    45    45    80    80    80    90
##  2       2 A        20    25    25    25    30    35    30    50
##  3       3 A        50    50    55    70    70    75    90    90
##  4       4 A        25    25    35    40    60    60    70    80
##  5       5 A       100   100   100   100   100   100   100   100
##  6       6 A        20    20    30    50    50    60    85    95
##  7       7 A        30    35    35    40    50    60    75    85
##  8       8 A        30    35    45    50    55    65    65    70
##  9       9 B        40    55    60    70    80    85    90    90
## 10      10 B        65    65    70    70    80    80    80    80
## 11      11 B        30    30    40    45    65    85    85    85
## 12      12 B        25    35    35    35    40    45    45    45
## 13      13 B        45    45    80    80    80    80    80    80
## 14      14 B        15    15    10    10    10    20    20    20
## 15      15 B        35    35    35    45    45    45    50    50
## 16      16 B        40    40    40    55    55    55    60    65
## 17      17 C        20    20    30    30    30    30    30    30
## 18      18 C        35    35    35    40    40    40    40    40
## 19      19 C        35    35    35    40    40    40    45    45
## 20      20 C        45    65    65    65    80    85    95   100
## 21      21 C        45    65    70    90    90    95    95   100
## 22      22 C        25    30    30    35    40    40    40    40
## 23      23 C        25    25    30    30    30    30    35    40
## 24      24 C        15    35    35    35    40    50    65    65
pairs(stroke.wide[, 3:10], main = "Scatter Plot Matrix for stroke Dataset")

library(ggplot2) 
library(GGally) 
ggpairs(stroke.wide[, 3:10], 
        title = "Scatter Plot Matrix for stroke Dataset", 
        axisLabels = "show")

library("gpairs")
gpairs(data.frame(stroke.wide[, 3:10]))

library(ggplot2)
ggcorr(stroke.wide[, 3:10],
       low = "#3B9AB2",
       mid = "#EEEEEE",
       high = "#F21A00",
       midpoint = 0.75,
       limits = c(0.5, 1))

cor(stroke.wide[, 3:10], method = c("pearson"))
##           week1     week2     week3     week4     week5     week6     week7
## week1 1.0000000 0.9280364 0.8820153 0.8306544 0.7936646 0.7125630 0.6163514
## week2 0.9280364 1.0000000 0.9225559 0.8774061 0.8466833 0.7895900 0.7041487
## week3 0.8820153 0.9225559 1.0000000 0.9530914 0.9092148 0.8542616 0.7667288
## week4 0.8306544 0.8774061 0.9530914 1.0000000 0.9215159 0.8786341 0.8313352
## week5 0.7936646 0.8466833 0.9092148 0.9215159 1.0000000 0.9734304 0.9149511
## week6 0.7125630 0.7895900 0.8542616 0.8786341 0.9734304 1.0000000 0.9569344
## week7 0.6163514 0.7041487 0.7667288 0.8313352 0.9149511 0.9569344 1.0000000
## week8 0.5544246 0.6425992 0.7007907 0.7716004 0.8819552 0.9266933 0.9776126
##           week8
## week1 0.5544246
## week2 0.6425992
## week3 0.7007907
## week4 0.7716004
## week5 0.8819552
## week6 0.9266933
## week7 0.9776126
## week8 1.0000000
#model 11.1
#Table11.3
lmodel1 <- lm(score ~ Group+week, data = stroke.long)
summary(lmodel1)
## 
## Call:
## lm(formula = score ~ Group + week, data = stroke.long)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.332 -13.907  -4.532  15.043  58.394 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.8415     3.9712   9.277  < 2e-16 ***
## GroupB       -5.6250     3.7147  -1.514  0.13164    
## GroupC      -12.1094     3.7147  -3.260  0.00132 ** 
## week          4.7644     0.6619   7.198 1.42e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.01 on 188 degrees of freedom
## Multiple R-squared:  0.2494,	Adjusted R-squared:  0.2374 
## F-statistic: 20.82 on 3 and 188 DF,  p-value: 1.084e-11
#model 11.2
lmodel2 <- lm(score ~ Group*week, data = stroke.long)
summary(lmodel2)
## 
## Call:
## lm(formula = score ~ Group * week, data = stroke.long)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.813 -13.560  -5.305  13.337  63.854 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.82143    5.77401   5.165 6.16e-07 ***
## GroupB       3.34821    8.16569   0.410   0.6823    
## GroupC      -0.02232    8.16569  -0.003   0.9978    
## week         6.32440    1.14342   5.531 1.07e-07 ***
## GroupB:week -1.99405    1.61705  -1.233   0.2191    
## GroupC:week -2.68601    1.61705  -1.661   0.0984 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.96 on 186 degrees of freedom
## Multiple R-squared:  0.2612,	Adjusted R-squared:  0.2413 
## F-statistic: 13.15 on 5 and 186 DF,  p-value: 5.695e-11
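Whether the group-specific slopes improve on the common-slope model can be tested by comparing the two nested fits directly; a sketch (output not shown):

anova(lmodel1, lmodel2)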
#Estimates of intercepts and slopes (and their standard errors) for each subject
#model 11.3, table 11.4
library(dplyr)
library(broom)
stroke.long |> group_by(Subject) |> 
  do(model = lm(score ~ week, data = .)) |>
  pull(model)
## [[1]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##        30.0          7.5  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      15.536        3.214  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      39.821        6.429  
## 
## 
## [[4]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      11.607        8.393  
## 
## 
## [[5]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##         100            0  
## 
## 
## [[6]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      0.8929      11.1905  
## 
## 
## [[7]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      15.357        7.976  
## 
## 
## [[8]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      25.357        5.893  
## 
## 
## [[9]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      38.571        7.262  
## 
## 
## [[10]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      61.964        2.619  
## 
## 
## [[11]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      14.464        9.702  
## 
## 
## [[12]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      26.071        2.679  
## 
## 
## [[13]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##       48.75         5.00  
## 
## 
## [[14]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      10.179        1.071  
## 
## 
## [[15]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##       31.25         2.50  
## 
## 
## [[16]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##       34.11         3.81  
## 
## 
## [[17]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      21.071        1.429  
## 
## 
## [[18]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##     34.1071       0.8929  
## 
## 
## [[19]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      32.143        1.607  
## 
## 
## [[20]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      42.321        7.262  
## 
## 
## [[21]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      48.571        7.262  
## 
## 
## [[22]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      24.821        2.262  
## 
## 
## [[23]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      22.321        1.845  
## 
## 
## [[24]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Coefficients:
## (Intercept)         week  
##      13.036        6.548
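A more compact route to the per-subject estimates and standard errors of Table 11.4 is to tidy each fit with broom (loaded above); a sketch (output not shown):

stroke.long |> group_by(Subject) |>
  group_modify(~ broom::tidy(lm(score ~ week, data = .x))) |>
  select(Subject, term, estimate, std.error)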
stroke.long |> group_by(Subject) |> 
  do(model = lm(score ~ week, data = .)) |> 
  pull(model) |> purrr::map(summary)
## [[1]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.000  -3.750   0.000   5.625  12.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   30.000      7.289   4.116  0.00624 **
## week           7.500      1.443   5.196  0.00202 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.354 on 6 degrees of freedom
## Multiple R-squared:  0.8182,	Adjusted R-squared:  0.7879 
## F-statistic:    27 on 1 and 6 DF,  p-value: 0.002022
## 
## 
## [[2]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.036 -2.054  0.000  1.696  8.750 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  15.5357     4.0994   3.790  0.00908 **
## week          3.2143     0.8118   3.959  0.00746 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.261 on 6 degrees of freedom
## Multiple R-squared:  0.7232,	Adjusted R-squared:  0.6771 
## F-statistic: 15.68 on 1 and 6 DF,  p-value: 0.007456
## 
## 
## [[3]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.107 -2.857 -1.607  3.929  5.179 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39.8214     3.2093   12.41 1.67e-05 ***
## week          6.4286     0.6355   10.12 5.43e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.119 on 6 degrees of freedom
## Multiple R-squared:  0.9446,	Adjusted R-squared:  0.9354 
## F-statistic: 102.3 on 1 and 6 DF,  p-value: 5.426e-05
## 
## 
## [[4]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.179 -2.321 -1.071  2.188  6.429 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11.6071     3.3874   3.427    0.014 *  
## week          8.3929     0.6708  12.512 1.59e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.347 on 6 degrees of freedom
## Multiple R-squared:  0.9631,	Adjusted R-squared:  0.9569 
## F-statistic: 156.5 on 1 and 6 DF,  p-value: 1.594e-05
## 
## 
## [[5]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##      0      0      0      0      0 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      100          0     Inf   <2e-16 ***
## week               0          0     NaN      NaN    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0 on 6 degrees of freedom
## Multiple R-squared:    NaN,	Adjusted R-squared:    NaN 
## F-statistic:   NaN on 1 and 6 DF,  p-value: NA
## 
## 
## [[6]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.0357 -5.0595  0.5357  4.8810  7.9167 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.8929     5.3036   0.168    0.872    
## week         11.1905     1.0503  10.655 4.03e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.807 on 6 degrees of freedom
## Multiple R-squared:  0.9498,	Adjusted R-squared:  0.9414 
## F-statistic: 113.5 on 1 and 6 DF,  p-value: 4.03e-05
## 
## 
## [[7]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2619 -4.5238  0.2381  4.3155  6.6667 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  15.3571     4.6695   3.289 0.016638 *  
## week          7.9762     0.9247   8.626 0.000134 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.993 on 6 degrees of freedom
## Multiple R-squared:  0.9254,	Adjusted R-squared:  0.9129 
## F-statistic:  74.4 on 1 and 6 DF,  p-value: 0.0001337
## 
## 
## [[8]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5000 -1.7411 -0.5357  1.2946  4.2857 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.3571     1.9710   12.87 1.36e-05 ***
## week          5.8929     0.3903   15.10 5.32e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.53 on 6 degrees of freedom
## Multiple R-squared:  0.9744,	Adjusted R-squared:  0.9701 
## F-statistic: 227.9 on 1 and 6 DF,  p-value: 5.324e-06
## 
## 
## [[9]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.667 -1.726  1.250  2.500  5.119 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  38.5714     3.5225   10.95 3.44e-05 ***
## week          7.2619     0.6976   10.41 4.60e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.521 on 6 degrees of freedom
## Multiple R-squared:  0.9475,	Adjusted R-squared:  0.9388 
## F-statistic: 108.4 on 1 and 6 DF,  p-value: 4.603e-05
## 
## 
## [[10]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9167 -2.2619 -0.0595  0.8929  4.9405 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  61.9643     2.2359  27.713 1.46e-07 ***
## week          2.6190     0.4428   5.915  0.00104 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.87 on 6 degrees of freedom
## Multiple R-squared:  0.8536,	Adjusted R-squared:  0.8292 
## F-statistic: 34.99 on 1 and 6 DF,  p-value: 0.001039
## 
## 
## [[11]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.2738 -4.6726 -0.7738  3.4226 12.3214 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   14.464      5.893   2.455 0.049480 *  
## week           9.702      1.167   8.314 0.000164 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.563 on 6 degrees of freedom
## Multiple R-squared:  0.9201,	Adjusted R-squared:  0.9068 
## F-statistic: 69.13 on 1 and 6 DF,  p-value: 0.0001642
## 
## 
## [[12]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7500 -1.9643  0.3571  1.3839  3.5714 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  26.0714     2.1466  12.146 1.89e-05 ***
## week          2.6786     0.4251   6.301 0.000745 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.755 on 6 degrees of freedom
## Multiple R-squared:  0.8687,	Adjusted R-squared:  0.8468 
## F-statistic: 39.71 on 1 and 6 DF,  p-value: 0.0007448
## 
## 
## [[13]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -13.75  -8.75  -1.25   7.50  16.25 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   48.750      8.927   5.461  0.00157 **
## week           5.000      1.768   2.828  0.03002 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.46 on 6 degrees of freedom
## Multiple R-squared:  0.5714,	Adjusted R-squared:    0.5 
## F-statistic:     8 on 1 and 6 DF,  p-value: 0.03002
## 
## 
## [[14]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.536 -3.661  1.786  2.857  3.750 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  10.1786     3.2093   3.172   0.0193 *
## week          1.0714     0.6355   1.686   0.1428  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.119 on 6 degrees of freedom
## Multiple R-squared:  0.3214,	Adjusted R-squared:  0.2083 
## F-statistic: 2.842 on 1 and 6 DF,  p-value: 0.1428
## 
## 
## [[15]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -3.75  -1.25   0.00   1.25   3.75 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  31.2500     1.9480  16.042 3.73e-06 ***
## week          2.5000     0.3858   6.481 0.000641 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.5 on 6 degrees of freedom
## Multiple R-squared:  0.875,	Adjusted R-squared:  0.8542 
## F-statistic:    42 on 1 and 6 DF,  p-value: 0.0006413
## 
## 
## [[16]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5357 -1.7857 -0.1786  1.9048  5.6548 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34.1071     2.8090  12.142  1.9e-05 ***
## week          3.8095     0.5563   6.848 0.000477 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.605 on 6 degrees of freedom
## Multiple R-squared:  0.8866,	Adjusted R-squared:  0.8677 
## F-statistic:  46.9 on 1 and 6 DF,  p-value: 0.0004768
## 
## 
## [[17]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9286 -2.5000 -0.3571  2.1429  4.6429 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  21.0714     2.5505   8.262  0.00017 ***
## week          1.4286     0.5051   2.828  0.03002 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.273 on 6 degrees of freedom
## Multiple R-squared:  0.5714,	Adjusted R-squared:    0.5 
## F-statistic:     8 on 1 and 6 DF,  p-value: 0.03002
## 
## 
## [[18]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7857 -0.9821 -0.1786  0.7589  2.3214 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34.1071     1.1641  29.298 1.05e-07 ***
## week          0.8929     0.2305   3.873  0.00824 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.494 on 6 degrees of freedom
## Multiple R-squared:  0.7143,	Adjusted R-squared:  0.6667 
## F-statistic:    15 on 1 and 6 DF,  p-value: 0.008237
## 
## 
## [[19]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.96429 -0.71429 -0.08929  1.29464  1.60714 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  32.1429     1.1641  27.611 1.49e-07 ***
## week          1.6071     0.2305   6.971 0.000433 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.494 on 6 degrees of freedom
## Multiple R-squared:  0.8901,	Adjusted R-squared:  0.8718 
## F-statistic:  48.6 on 1 and 6 DF,  p-value: 0.000433
## 
## 
## [[20]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3690 -1.8155  0.2381  1.4881  8.1548 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  42.3214     3.6977  11.445 2.67e-05 ***
## week          7.2619     0.7322   9.917 6.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.746 on 6 degrees of freedom
## Multiple R-squared:  0.9425,	Adjusted R-squared:  0.9329 
## F-statistic: 98.35 on 1 and 6 DF,  p-value: 6.073e-05
## 
## 
## [[21]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.8333  -4.9702   0.7738   3.4226  12.3810 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   48.571      6.140   7.910 0.000217 ***
## week           7.262      1.216   5.972 0.000989 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.881 on 6 degrees of freedom
## Multiple R-squared:  0.856,	Adjusted R-squared:  0.832 
## F-statistic: 35.66 on 1 and 6 DF,  p-value: 0.0009885
## 
## 
## [[22]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.917 -1.726  0.000  1.250  3.869 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  24.8214     1.8851  13.167 1.18e-05 ***
## week          2.2619     0.3733   6.059 0.000916 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.419 on 6 degrees of freedom
## Multiple R-squared:  0.8595,	Adjusted R-squared:  0.8361 
## F-statistic: 36.71 on 1 and 6 DF,  p-value: 0.0009162
## 
## 
## [[23]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3929 -1.1458  0.0298  1.1607  2.9167 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  22.3214     1.7092  13.060 1.24e-05 ***
## week          1.8452     0.3385   5.452  0.00158 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.194 on 6 degrees of freedom
## Multiple R-squared:  0.832,	Adjusted R-squared:  0.804 
## F-statistic: 29.72 on 1 and 6 DF,  p-value: 0.001585
## 
## 
## [[24]]
## 
## Call:
## lm(formula = score ~ week, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.774 -4.316 -1.369  3.274  8.869 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13.0357     4.4920   2.902 0.027263 *  
## week          6.5476     0.8895   7.361 0.000322 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.765 on 6 degrees of freedom
## Multiple R-squared:  0.9003,	Adjusted R-squared:  0.8837 
## F-statistic: 54.18 on 1 and 6 DF,  p-value: 0.000322
library(tidyverse)
models <- stroke.long |> group_by(Subject) |> 
  do(model = lm(score ~ week, data = .)) |> 
  pull(model) |> purrr::map(broom::tidy)
models[[1]][,1:3]
## # A tibble: 2 × 3
##   term        estimate std.error
##   <chr>          <dbl>     <dbl>
## 1 (Intercept)     30        7.29
## 2 week             7.5      1.44
models[[1]][,1:3] |> pivot_wider(names_from = term, values_from = c(estimate, std.error))
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                     30           7.5                    7.29           1.44
#Table 11.4
models2 <- models |> purrr::map(\(table) table[, 1:3]) |> 
  map(~ pivot_wider(.x, names_from = 1, values_from = c(2, 3)))
models2
## [[1]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                     30           7.5                    7.29           1.44
## 
## [[2]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   15.5          3.21                    4.10          0.812
## 
## [[3]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   39.8          6.43                    3.21          0.636
## 
## [[4]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   11.6          8.39                    3.39          0.671
## 
## [[5]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                    100             0                       0              0
## 
## [[6]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                  0.893          11.2                    5.30           1.05
## 
## [[7]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   15.4          7.98                    4.67          0.925
## 
## [[8]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   25.4          5.89                    1.97          0.390
## 
## [[9]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   38.6          7.26                    3.52          0.698
## 
## [[10]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   62.0          2.62                    2.24          0.443
## 
## [[11]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   14.5          9.70                    5.89           1.17
## 
## [[12]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   26.1          2.68                    2.15          0.425
## 
## [[13]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   48.8             5                    8.93           1.77
## 
## [[14]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   10.2          1.07                    3.21          0.636
## 
## [[15]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   31.3           2.5                    1.95          0.386
## 
## [[16]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   34.1          3.81                    2.81          0.556
## 
## [[17]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   21.1          1.43                    2.55          0.505
## 
## [[18]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   34.1         0.893                    1.16          0.231
## 
## [[19]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   32.1          1.61                    1.16          0.231
## 
## [[20]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   42.3          7.26                    3.70          0.732
## 
## [[21]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   48.6          7.26                    6.14           1.22
## 
## [[22]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   24.8          2.26                    1.89          0.373
## 
## [[23]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   22.3          1.85                    1.71          0.338
## 
## [[24]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_week `std.error_(Intercept)` std.error_week
##                    <dbl>         <dbl>                   <dbl>          <dbl>
## 1                   13.0          6.55                    4.49          0.890
#Table 11.4
names(models2) <- 1:24
bind_rows(models2, .id = "Subject")
## # A tibble: 24 × 5
##    Subject `estimate_(Intercept)` estimate_week `std.error_(Intercept)`
##    <chr>                    <dbl>         <dbl>                   <dbl>
##  1 1                       30             7.5                      7.29
##  2 2                       15.5           3.21                     4.10
##  3 3                       39.8           6.43                     3.21
##  4 4                       11.6           8.39                     3.39
##  5 5                      100             0                        0   
##  6 6                        0.893        11.2                      5.30
##  7 7                       15.4           7.98                     4.67
##  8 8                       25.4           5.89                     1.97
##  9 9                       38.6           7.26                     3.52
## 10 10                      62.0           2.62                     2.24
## 11 11                      14.5           9.70                     5.89
## 12 12                      26.1           2.68                     2.15
## 13 13                      48.8           5                        8.93
## 14 14                      10.2           1.07                     3.21
## 15 15                      31.3           2.5                      1.95
## 16 16                      34.1           3.81                     2.81
## 17 17                      21.1           1.43                     2.55
## 18 18                      34.1           0.893                    1.16
## 19 19                      32.1           1.61                     1.16
## 20 20                      42.3           7.26                     3.70
## 21 21                      48.6           7.26                     6.14
## 22 22                      24.8           2.26                     1.89
## 23 23                      22.3           1.85                     1.71
## 24 24                      13.0           6.55                     4.49
## # ℹ 1 more variable: std.error_week <dbl>
models3 <- bind_rows(models2, .id = "Subject")
models3["Groups"] <- factor(c(rep("A", 8), rep("B", 8), rep("C", 8)))
models3
## # A tibble: 24 × 6
##    Subject `estimate_(Intercept)` estimate_week `std.error_(Intercept)`
##    <chr>                    <dbl>         <dbl>                   <dbl>
##  1 1                       30             7.5                      7.29
##  2 2                       15.5           3.21                     4.10
##  3 3                       39.8           6.43                     3.21
##  4 4                       11.6           8.39                     3.39
##  5 5                      100             0                        0   
##  6 6                        0.893        11.2                      5.30
##  7 7                       15.4           7.98                     4.67
##  8 8                       25.4           5.89                     1.97
##  9 9                       38.6           7.26                     3.52
## 10 10                      62.0           2.62                     2.24
## 11 11                      14.5           9.70                     5.89
## 12 12                      26.1           2.68                     2.15
## 13 13                      48.8           5                        8.93
## 14 14                      10.2           1.07                     3.21
## 15 15                      31.3           2.5                      1.95
## 16 16                      34.1           3.81                     2.81
## 17 17                      21.1           1.43                     2.55
## 18 18                      34.1           0.893                    1.16
## 19 19                      32.1           1.61                     1.16
## 20 20                      42.3           7.26                     3.70
## 21 21                      48.6           7.26                     6.14
## 22 22                      24.8           2.26                     1.89
## 23 23                      22.3           1.85                     1.71
## 24 24                      13.0           6.55                     4.49
## # ℹ 2 more variables: std.error_week <dbl>, Groups <fct>
#Table 11.5 Analysis of variance of intercept estimates
summary(lm(`estimate_(Intercept)` ~ Groups, data = models3))
## 
## Call:
## lm(formula = `estimate_(Intercept)` ~ Groups, data = models3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.929 -14.330  -3.192   6.551  70.179 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.82143    7.57209   3.938 0.000753 ***
## GroupsB      3.34821   10.70855   0.313 0.757617    
## GroupsC     -0.02232   10.70855  -0.002 0.998357    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.42 on 21 degrees of freedom
## Multiple R-squared:  0.00621,	Adjusted R-squared:  -0.08844 
## F-statistic: 0.06561 on 2 and 21 DF,  p-value: 0.9367
#Table 11.6 Analysis of variance of slope estimates
summary(lm(estimate_week ~ Groups, data = models3))
## 
## Call:
## lm(formula = estimate_week ~ Groups, data = models3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3244 -1.8806 -0.4762  2.2786  5.3720 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.324      1.080   5.854 8.25e-06 ***
## GroupsB       -1.994      1.528  -1.305   0.2060    
## GroupsC       -2.686      1.528  -1.758   0.0933 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.056 on 21 degrees of freedom
## Multiple R-squared:  0.1369,	Adjusted R-squared:  0.05475 
## F-statistic: 1.666 on 2 and 21 DF,  p-value: 0.213
stroke.long <- stroke.wide |> pivot_longer(
    cols = starts_with("week"), 
    names_to = "week", 
    values_to = "score",
    values_drop_na = TRUE
  ) |> 
  mutate(
    week = readr::parse_number(week),
    Group = factor(Group)
  )
colnames(stroke.long) <- c("Subject","Group","time","ability")
stroke.long
## # A tibble: 192 × 4
##    Subject Group  time ability
##      <dbl> <fct> <dbl>   <dbl>
##  1       1 A         1      45
##  2       1 A         2      45
##  3       1 A         3      45
##  4       1 A         4      45
##  5       1 A         5      80
##  6       1 A         6      80
##  7       1 A         7      80
##  8       1 A         8      90
##  9       2 A         1      20
## 10       2 A         2      25
## 11       2 A         3      25
## 12       2 A         4      25
## 13       2 A         5      30
## 14       2 A         6      35
## 15       2 A         7      30
## 16       2 A         8      50
## 17       3 A         1      50
## 18       3 A         2      50
## 19       3 A         3      55
## 20       3 A         4      70
## 21       3 A         5      70
## 22       3 A         6      75
## 23       3 A         7      90
## 24       3 A         8      90
## 25       4 A         1      25
## # ℹ 167 more rows
#model 11.2 pooled
lmodel2 <- glm(ability ~ Group*time, data = stroke.long)
summary(lmodel2)
## 
## Call:
## glm(formula = ability ~ Group * time, data = stroke.long)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.82143    5.77401   5.165 6.16e-07 ***
## GroupB       3.34821    8.16569   0.410   0.6823    
## GroupC      -0.02232    8.16569  -0.003   0.9978    
## time         6.32440    1.14342   5.531 1.07e-07 ***
## GroupB:time -1.99405    1.61705  -1.233   0.2191    
## GroupC:time -2.68601    1.61705  -1.661   0.0984 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 439.2931)
## 
##     Null deviance: 110597  on 191  degrees of freedom
## Residual deviance:  81709  on 186  degrees of freedom
## AIC: 1721.1
## 
## Number of Fisher Scoring iterations: 2
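Since glm() with the default gaussian family and identity link fits the same model as ordinary least squares, the pooled fit can be cross-checked against lm(); a quick sketch reusing the objects above:

#gaussian glm and lm give identical coefficients
all.equal(coef(lmodel2), coef(lm(ability ~ Group*time, data = stroke.long)))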

The estimate of \(\sigma_e\) is the square root of the residual deviance divided by its degrees of freedom, \(\hat\sigma_e=\sqrt{D/\mathrm{df}}\):

lmodel2$df.residual
## [1] 186
lmodel2$deviance
## [1] 81708.52
#The estimate of \sigma_e is
sqrt(lmodel2$deviance/lmodel2$df.residual)
## [1] 20.95932
#the deviance
sum((lmodel2$fitted.values - lmodel2$data$ability)^2)
## [1] 81708.52
#Data reduction
summary(lm(`estimate_(Intercept)` ~ Groups, data = models3))
## 
## Call:
## lm(formula = `estimate_(Intercept)` ~ Groups, data = models3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.929 -14.330  -3.192   6.551  70.179 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.82143    7.57209   3.938 0.000753 ***
## GroupsB      3.34821   10.70855   0.313 0.757617    
## GroupsC     -0.02232   10.70855  -0.002 0.998357    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.42 on 21 degrees of freedom
## Multiple R-squared:  0.00621,	Adjusted R-squared:  -0.08844 
## F-statistic: 0.06561 on 2 and 21 DF,  p-value: 0.9367
summary(lm(estimate_week ~ Groups, data = models3))
## 
## Call:
## lm(formula = estimate_week ~ Groups, data = models3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3244 -1.8806 -0.4762  2.2786  5.3720 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.324      1.080   5.854 8.25e-06 ***
## GroupsB       -1.994      1.528  -1.305   0.2060    
## GroupsC       -2.686      1.528  -1.758   0.0933 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.056 on 21 degrees of freedom
## Multiple R-squared:  0.1369,	Adjusted R-squared:  0.05475 
## F-statistic: 1.666 on 2 and 21 DF,  p-value: 0.213
library(geepack)
#GEE independent
gee.ind <- geeglm(ability ~ Group*time,
                  family = gaussian("identity"),
                  data = stroke.long,
                  id = Subject,
                  waves = time,
                  std.err = "san.se",
                  corstr = "independence")
summary(gee.ind)
## 
## Call:
## geeglm(formula = ability ~ Group * time, family = gaussian("identity"), 
##     data = stroke.long, id = Subject, waves = time, corstr = "independence", 
##     std.err = "san.se")
## 
##  Coefficients:
##             Estimate  Std.err   Wald Pr(>|W|)    
## (Intercept) 29.82143 10.17573  8.589  0.00338 ** 
## GroupB       3.34821 11.63333  0.083  0.77349    
## GroupC      -0.02232 10.89585  0.000  0.99837    
## time         6.32440  1.13148 31.243 2.28e-08 ***
## GroupB:time -1.99405  1.47762  1.821  0.17718    
## GroupC:time -2.68601  1.47010  3.338  0.06769 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)    425.6   103.2
## Number of clusters:   24  Maximum cluster size: 8
gee.ind$coefficients
## (Intercept)      GroupB      GroupC        time GroupB:time GroupC:time 
##    29.82143     3.34821    -0.02232     6.32440    -1.99405    -2.68601
summary(gee.ind)$coefficients
##             Estimate Std.err      Wald  Pr(>|W|)
## (Intercept) 29.82143  10.176 8.589e+00 3.383e-03
## GroupB       3.34821  11.633 8.284e-02 7.735e-01
## GroupC      -0.02232  10.896 4.197e-06 9.984e-01
## time         6.32440   1.131 3.124e+01 2.277e-08
## GroupB:time -1.99405   1.478 1.821e+00 1.772e-01
## GroupC:time -2.68601   1.470 3.338e+00 6.769e-02
#GEE exchangeable
gee.exch <- geeglm(ability ~ Group*time, family = gaussian,
                   std.err = "san.se",
                   data = stroke.long,
                   id = Subject,
                   waves = time,
                   corstr = "exchangeable")
summary(gee.exch)
## 
## Call:
## geeglm(formula = ability ~ Group * time, family = gaussian, data = stroke.long, 
##     id = Subject, waves = time, corstr = "exchangeable", std.err = "san.se")
## 
##  Coefficients:
##             Estimate Std.err  Wald Pr(>|W|)    
## (Intercept)  29.8214 10.1757  8.59   0.0034 ** 
## GroupB        3.3482 11.6333  0.08   0.7735    
## GroupC       -0.0223 10.8958  0.00   0.9984    
## time          6.3244  1.1315 31.24  2.3e-08 ***
## GroupB:time  -1.9940  1.4776  1.82   0.1772    
## GroupC:time  -2.6860  1.4701  3.34   0.0677 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = exchangeable 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)      426     103
##   Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha    0.831   0.035
## Number of clusters:   24  Maximum cluster size: 8
gee.exch$cor.link
## [1] "identity"
#the df
gee.exch$df.residual
## [1] 186
#the deviance
sum((gee.exch$fitted.values - gee.exch$data$ability)^2)
## [1] 81709
#The estimate of \sigma_e is
sqrt(sum((gee.exch$fitted.values - gee.exch$data$ability)^2)/gee.exch$df.residual)
## [1] 21
#an autoregressive model of order 1, AR(1),
gee.ar1 <- geeglm(ability ~ Group*time,
                  family = gaussian,
                  std.err = "san.se",
                  data = stroke.long,
                  id = Subject,
                  waves = time,
                  corstr = "ar1")
summary(gee.ar1)
## 
## Call:
## geeglm(formula = ability ~ Group * time, family = gaussian, data = stroke.long, 
##     id = Subject, waves = time, corstr = "ar1", std.err = "san.se")
## 
##  Coefficients:
##             Estimate Std.err  Wald Pr(>|W|)    
## (Intercept)   33.239   9.757 11.61  0.00066 ***
## GroupB         0.121  10.957  0.00  0.99122    
## GroupC        -5.960  10.376  0.33  0.56566    
## time           6.077   1.035 34.45  4.4e-09 ***
## GroupB:time   -2.139   1.333  2.58  0.10854    
## GroupC:time   -2.243   1.472  2.32  0.12756    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = ar1 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)      430     103
##   Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha    0.933  0.0158
## Number of clusters:   24  Maximum cluster size: 8
#the df
gee.ar1$df.residual
## [1] 186
#the deviance
sum((gee.ar1$fitted.values - gee.ar1$data$ability)^2)
## [1] 82465
#The estimate of \sigma_e is
sqrt(sum((gee.ar1$fitted.values - gee.ar1$data$ability)^2)/gee.ar1$df.residual)
## [1] 21.1
summary(gee.ar1)$coefficients
##             Estimate Std.err     Wald Pr(>|W|)
## (Intercept)   33.239    9.76 1.16e+01 6.58e-04
## GroupB         0.121   10.96 1.21e-04 9.91e-01
## GroupC        -5.960   10.38 3.30e-01 5.66e-01
## time           6.077    1.04 3.44e+01 4.38e-09
## GroupB:time   -2.139    1.33 2.58e+00 1.09e-01
## GroupC:time   -2.243    1.47 2.32e+00 1.28e-01
#GEE unstructured
gee.un <- geeglm(ability ~ Group*time,
                 family = gaussian,
                 data = stroke.long,
                 id = Subject,
                 waves = time,
                 corstr = "unstructured")
summary(gee.un)
## 
## Call:
## geeglm(formula = ability ~ Group * time, family = gaussian, data = stroke.long, 
##     id = Subject, waves = time, corstr = "unstructured")
## 
##  Coefficients:
##             Estimate  Std.err Wald Pr(>|W|)
## (Intercept)   27.536 1727.066    0     0.99
## GroupB         0.918 2222.037    0     1.00
## GroupC         1.549 2010.530    0     1.00
## time           6.879  469.581    0     0.99
## GroupB:time   -1.423  603.770    0     1.00
## GroupC:time   -3.073  546.696    0     1.00
## 
## Correlation structure = unstructured 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)      428    2667
##   Link = identity 
## 
## Estimated Correlation Parameters:
##           Estimate Std.err
## alpha.1:2    0.713   15.87
## alpha.1:3    0.708   12.86
## alpha.1:4    0.694   10.97
## alpha.1:5    0.692   15.82
## alpha.1:6    0.601   10.88
## alpha.1:7    0.534    7.01
## alpha.1:8    0.462   24.01
## alpha.2:3    0.793    8.98
## alpha.2:4    0.796    9.58
## alpha.2:5    0.805    9.89
## alpha.2:6    0.746    7.69
## alpha.2:7    0.702    5.42
## alpha.2:8    0.638   22.63
## alpha.3:4    0.920   13.66
## alpha.3:5    0.914   10.99
## alpha.3:6    0.854   12.63
## alpha.3:7    0.810    7.49
## alpha.3:8    0.737   20.52
## alpha.4:5    0.972   11.63
## alpha.4:6    0.922   14.14
## alpha.4:7    0.922    9.75
## alpha.4:8    0.850   20.73
## alpha.5:6    1.046   10.89
## alpha.5:7    1.032    2.52
## alpha.5:8    0.979   19.20
## alpha.6:7    1.072    8.55
## alpha.6:8    1.023   20.63
## alpha.7:8    1.145   30.29
## Number of clusters:   24  Maximum cluster size: 8
#the df
gee.un$df.residual
## [1] 186
#the deviance
sum((gee.un$fitted.values - gee.un$data$ability)^2)
## [1] 82257
#The estimate of \sigma_e is
sqrt(sum((gee.un$fitted.values - gee.un$data$ability)^2)/gee.un$df.residual)
## [1] 21
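Likelihood-based criteria such as AIC are not defined for GEE fits, so the four working correlation structures above cannot be compared the way the gls fits are compared later. One option is Pan's quasi-likelihood information criterion; a minimal sketch, assuming a geepack version (1.3 or later) that exports QIC():

#smaller QIC suggests a better working correlation structure
QIC(gee.ind)
QIC(gee.exch)
QIC(gee.ar1)
QIC(gee.un)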
  • nlme::lme: this package computes denominator degrees of freedom by a “within-between” (parameter-counting) approach, which is adequate for simpler model structures (single-term or nested models) but becomes progressively less applicable for random-slopes models (see the comments in the GLMM FAQ), crossed random effects, unbalanced designs, and other structures where the random effects themselves are no longer iid; a rough sketch of the counting rule is given below.
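For the stroke data (192 observations nested in 24 subjects), the rule assigns between-subject terms m − 1 − p_between denominator df and within-subject terms n − m − p_within; the two values reproduce the DF column of the lme summary below:

n <- 192; m <- 24   # observations; subjects (clusters)
p_between <- 2      # GroupB, GroupC are constant within a subject
p_within <- 3       # time and the Group:time terms vary within a subject
m - 1 - p_between   # denominator df for the between-subject (Group) contrasts
## [1] 21
n - m - p_within    # denominator df for the within-subject terms (and the intercept)
## [1] 165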
#random effects models
library(nlme)
rndeff<-lme(ability~Group*time,
            data=stroke.long,
            random=~1|Subject)
summary(rndeff)
## Linear mixed-effects model fit by REML
##   Data: stroke.long 
##    AIC  BIC logLik
##   1453 1479   -718
## 
## Random effects:
##  Formula: ~1 | Subject
##         (Intercept) Residual
## StdDev:        20.1     8.56
## 
## Fixed effects:  ability ~ Group * time 
##             Value Std.Error  DF t-value p-value
## (Intercept) 29.82      7.50 165    3.98  0.0001
## GroupB       3.35     10.60  21    0.32  0.7553
## GroupC      -0.02     10.60  21    0.00  0.9983
## time         6.32      0.47 165   13.54  0.0000
## GroupB:time -1.99      0.66 165   -3.02  0.0030
## GroupC:time -2.69      0.66 165   -4.07  0.0001
##  Correlation: 
##             (Intr) GroupB GroupC time   GrpB:t
## GroupB      -0.707                            
## GroupC      -0.707  0.500                     
## time        -0.280  0.198  0.198              
## GroupB:time  0.198 -0.280 -0.140 -0.707       
## GroupC:time  0.198 -0.140 -0.280 -0.707  0.500
## 
## Standardized Within-Group Residuals:
##      Min       Q1      Med       Q3      Max 
## -2.65509 -0.53335 -0.00376  0.53502  2.69237 
## 
## Number of Observations: 192
## Number of Groups: 24
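A random intercept implies an exchangeable (compound-symmetry) correlation within subjects, with intraclass correlation \(\rho=\sigma^2_u/(\sigma^2_u+\sigma^2_e)\). Plugging in the REML standard deviations reported above:

#implied within-subject correlation
20.1^2 / (20.1^2 + 8.56^2)
## [1] 0.8464779

which agrees with the Rho of about 0.847 estimated by the compound-symmetry gls fit below.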
#note: corAR1 makes this "independence" fit identical to gls.ar1 below;
#a truly independent fit would omit the correlation argument
ind <- corAR1(form = ~ 1 | Subject)
gls.ind <- gls(ability ~ Group*time, 
               data = stroke.long,
               correlation = ind)
summary(gls.ind)
## Generalized least squares fit by REML
##   Model: ability ~ Group * time 
##   Data: stroke.long 
##    AIC  BIC logLik
##   1320 1346   -652
## 
## Correlation Structure: AR(1)
##  Formula: ~1 | Subject 
##  Parameter estimate(s):
##  Phi 
## 0.95 
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept)  33.4      7.94    4.21  0.0000
## GroupB       -0.1     11.22   -0.01  0.9918
## GroupC       -6.2     11.22   -0.55  0.5798
## time          6.1      0.84    7.20  0.0000
## GroupB:time  -2.1      1.19   -1.79  0.0744
## GroupC:time  -2.2      1.19   -1.88  0.0622
## 
##  Correlation: 
##             (Intr) GroupB GroupC time   GrpB:t
## GroupB      -0.707                            
## GroupC      -0.707  0.500                     
## time        -0.478  0.338  0.338              
## GroupB:time  0.338 -0.478 -0.239 -0.707       
## GroupC:time  0.338 -0.239 -0.478 -0.707  0.500
## 
## Standardized residuals:
##    Min     Q1    Med     Q3    Max 
## -2.143 -0.586 -0.226  0.653  2.825 
## 
## Residual standard error: 21.4 
## Degrees of freedom: 192 total; 186 residual
gls.ind$coefficients
## (Intercept)      GroupB      GroupC        time GroupB:time GroupC:time 
##      33.393      -0.115      -6.226       6.075      -2.141      -2.238
summary(gls.ind)$coefficients
## (Intercept)      GroupB      GroupC        time GroupB:time GroupC:time 
##      33.393      -0.115      -6.226       6.075      -2.141      -2.238
exch<-corCompSymm(form = ~ 1 | Subject)
gls.exch<-gls(ability~Group*time, 
              data=stroke.long,
              correlation=exch)
summary(gls.exch)
## Generalized least squares fit by REML
##   Model: ability ~ Group * time 
##   Data: stroke.long 
##    AIC  BIC logLik
##   1453 1479   -718
## 
## Correlation Structure: Compound symmetry
##  Formula: ~1 | Subject 
##  Parameter estimate(s):
##   Rho 
## 0.847 
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept) 29.82      7.50    3.98  0.0001
## GroupB       3.35     10.60    0.32  0.7525
## GroupC      -0.02     10.60    0.00  0.9983
## time         6.32      0.47   13.54  0.0000
## GroupB:time -1.99      0.66   -3.02  0.0029
## GroupC:time -2.69      0.66   -4.07  0.0001
## 
##  Correlation: 
##             (Intr) GroupB GroupC time   GrpB:t
## GroupB      -0.707                            
## GroupC      -0.707  0.500                     
## time        -0.280  0.198  0.198              
## GroupB:time  0.198 -0.280 -0.140 -0.707       
## GroupC:time  0.198 -0.140 -0.280 -0.707  0.500
## 
## Standardized residuals:
##    Min     Q1    Med     Q3    Max 
## -2.186 -0.620 -0.243  0.610  2.919 
## 
## Residual standard error: 21.9 
## Degrees of freedom: 192 total; 186 residual
summary(gls.exch)$coefficients
## (Intercept)      GroupB      GroupC        time GroupB:time GroupC:time 
##     29.8214      3.3482     -0.0223      6.3244     -1.9940     -2.6860
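The compound-symmetry GLS fit is the marginal counterpart of the random-intercept model fitted with lme above: the fixed-effect estimates, standard errors, AIC and log-likelihood (−718 in both summaries) coincide. A quick check:

logLik(rndeff)
logLik(gls.exch)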
ar1<-corAR1(form = ~ 1 | Subject)
gls.ar1<-gls(ability~Group*time, data=stroke.long,
             correlation=ar1)
summary(gls.ar1)
## Generalized least squares fit by REML
##   Model: ability ~ Group * time 
##   Data: stroke.long 
##    AIC  BIC logLik
##   1320 1346   -652
## 
## Correlation Structure: AR(1)
##  Formula: ~1 | Subject 
##  Parameter estimate(s):
##  Phi 
## 0.95 
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept)  33.4      7.94    4.21  0.0000
## GroupB       -0.1     11.22   -0.01  0.9918
## GroupC       -6.2     11.22   -0.55  0.5798
## time          6.1      0.84    7.20  0.0000
## GroupB:time  -2.1      1.19   -1.79  0.0744
## GroupC:time  -2.2      1.19   -1.88  0.0622
## 
##  Correlation: 
##             (Intr) GroupB GroupC time   GrpB:t
## GroupB      -0.707                            
## GroupC      -0.707  0.500                     
## time        -0.478  0.338  0.338              
## GroupB:time  0.338 -0.478 -0.239 -0.707       
## GroupC:time  0.338 -0.239 -0.478 -0.707  0.500
## 
## Standardized residuals:
##    Min     Q1    Med     Q3    Max 
## -2.143 -0.586 -0.226  0.653  2.825 
## 
## Residual standard error: 21.4 
## Degrees of freedom: 192 total; 186 residual
un<-corSymm(form = ~ 1 | Subject)
gls.un<-gls(ability~Group+time+Group*time, data=stroke.long,
            correlation=un)
summary(gls.un)
## Generalized least squares fit by REML
##   Model: ability ~ Group + time + Group * time 
##   Data: stroke.long 
##    AIC  BIC logLik
##   1338 1451   -634
## 
## Correlation Structure: General
##  Formula: ~1 | Subject 
##  Parameter estimate(s):
##  Correlation: 
##   1     2     3     4     5     6     7    
## 2 0.931                                    
## 3 0.868 0.931                              
## 4 0.789 0.875 0.952                        
## 5 0.708 0.819 0.892 0.913                  
## 6 0.576 0.731 0.815 0.855 0.965            
## 7 0.426 0.606 0.693 0.782 0.886 0.945      
## 8 0.319 0.522 0.609 0.707 0.840 0.908 0.975
## 
## Coefficients:
##             Value Std.Error t-value p-value
## (Intercept)  35.7      7.94    4.50  0.0000
## GroupB       -5.5     11.24   -0.49  0.6244
## GroupC      -11.3     11.24   -1.01  0.3156
## time          6.7      1.17    5.74  0.0000
## GroupB:time  -3.2      1.65   -1.96  0.0512
## GroupC:time  -3.9      1.65   -2.34  0.0204
## 
##  Correlation: 
##             (Intr) GroupB GroupC time   GrpB:t
## GroupB      -0.707                            
## GroupC      -0.707  0.500                     
## time        -0.760  0.537  0.537              
## GroupB:time  0.537 -0.760 -0.380 -0.707       
## GroupC:time  0.537 -0.380 -0.760 -0.707  0.500
## 
## Standardized residuals:
##     Min      Q1     Med      Q3     Max 
## -2.4710 -0.5084 -0.0354  0.8261  2.7072 
## 
## Residual standard error: 21.3 
## Degrees of freedom: 192 total; 186 residual
#gls.ind and gls.ar1 are the same AR(1) fit (see note above), so AR(1) gives the lowest AIC
AIC(gls.ind,gls.exch,gls.ar1,gls.un)
##          df  AIC
## gls.ind   8 1320
## gls.exch  8 1453
## gls.ar1   8 1320
## gls.un   35 1338
#Exercises 11.1
#Table 11.9
dogs
## # A tibble: 40 × 4
##      dog condition     y     x
##    <dbl>     <dbl> <dbl> <dbl>
##  1     1         1  81.7  54.3
##  2     1         2  84.3  62  
##  3     1         3  72.8  62.3
##  4     1         4  71.7  47.3
##  5     1         5  76.7  53.6
##  6     1         6  75.8  38  
##  7     1         7  77.3  54.2
##  8     1         8  86.3  54  
##  9     2         1 105    81.5
## 10     2         2 114.   80.8
## 11     2         3 109.   74.5
## 12     2         4  83.9  71.9
## 13     2         5  89    79.5
## 14     2         6  86.1  73  
## 15     2         7  88.7  74.7
## 16     2         8 118.   88.6
## 17     3         1  95.5  65  
## 18     3         2  95.7  68.3
## 19     3         3  84    67.9
## 20     3         4  85.8  61  
## 21     3         5  98.8  66  
## 22     3         6 106.   81.8
## 23     3         7 106.   71.4
## 24     3         8 115    96  
## 25     4         1 113.   87.5
## # ℹ 15 more rows
#Pooled
Pooled.dogs <- glm(y ~ x, data = dogs)
summary(Pooled.dogs)
## 
## Call:
## glm(formula = y ~ x, data = dogs)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40.7681     6.6173    6.16  3.4e-07 ***
## x             0.7692     0.0916    8.40  3.4e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 62.6)
## 
##     Null deviance: 6795  on 39  degrees of freedom
## Residual deviance: 2378  on 38  degrees of freedom
## AIC: 282.9
## 
## Number of Fisher Scoring iterations: 2
models.dog <- dogs |> group_by(dog) |> 
  do(model = lm(y ~ x, data = .)) |> 
  pull(model) |> purrr::map(broom::tidy)
models.dog
## [[1]]
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   67.4      14.1       4.78  0.00305
## 2 x              0.206     0.262     0.784 0.463  
## 
## [[2]]
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   -43.1     51.0      -0.845  0.430 
## 2 x               1.82     0.651     2.80   0.0313
## 
## [[3]]
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   43.2      15.6        2.76  0.0327
## 2 x              0.765     0.214      3.57  0.0117
## 
## [[4]]
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   60.5      17.2        3.52  0.0126
## 2 x              0.547     0.212      2.58  0.0420
## 
## [[5]]
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   58.9      17.6        3.34  0.0155
## 2 x              0.530     0.244      2.17  0.0729
models.dog2 <- models.dog |> purrr::map(\(table) table[, 1:3]) |> 
  map(~ pivot_wider(.x, names_from = 1, values_from = c(2, 3)))
models.dog2
## [[1]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_x `std.error_(Intercept)` std.error_x
##                    <dbl>      <dbl>                   <dbl>       <dbl>
## 1                   67.4      0.206                    14.1       0.262
## 
## [[2]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_x `std.error_(Intercept)` std.error_x
##                    <dbl>      <dbl>                   <dbl>       <dbl>
## 1                  -43.1       1.82                    51.0       0.651
## 
## [[3]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_x `std.error_(Intercept)` std.error_x
##                    <dbl>      <dbl>                   <dbl>       <dbl>
## 1                   43.2      0.765                    15.6       0.214
## 
## [[4]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_x `std.error_(Intercept)` std.error_x
##                    <dbl>      <dbl>                   <dbl>       <dbl>
## 1                   60.5      0.547                    17.2       0.212
## 
## [[5]]
## # A tibble: 1 × 4
##   `estimate_(Intercept)` estimate_x `std.error_(Intercept)` std.error_x
##                    <dbl>      <dbl>                   <dbl>       <dbl>
## 1                   58.9      0.530                    17.6       0.244
names(models.dog2) <- 1:5
models.dog3 <- bind_rows(models.dog2, .id = "dog")
models.dog3["dog"] <- 1:5
models.dog3
## # A tibble: 5 × 5
##     dog `estimate_(Intercept)` estimate_x `std.error_(Intercept)` std.error_x
##   <int>                  <dbl>      <dbl>                   <dbl>       <dbl>
## 1     1                   67.4      0.206                    14.1       0.262
## 2     2                  -43.1      1.82                     51.0       0.651
## 3     3                   43.2      0.765                    15.6       0.214
## 4     4                   60.5      0.547                    17.2       0.212
## 5     5                   58.9      0.530                    17.6       0.244
mean(models.dog3$`estimate_(Intercept)`)
## [1] 37.4
#Data reduction
#dogs, ignoring conditions

#Intercept (s.e.)
summary(lm(`estimate_(Intercept)` ~ 1, data = models.dog3))
## 
## Call:
## lm(formula = `estimate_(Intercept)` ~ 1, data = models.dog3)
## 
## Residuals:
##     1     2     3     4     5 
##  30.0 -80.5   5.8  23.1  21.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)     37.4       20.5    1.82     0.14
## 
## Residual standard error: 45.8 on 4 degrees of freedom
#Slope (s.e.)
summary(lm(estimate_x ~ 1, data = models.dog3))
## 
## Call:
## lm(formula = estimate_x ~ 1, data = models.dog3)
## 
## Residuals:
##        1        2        3        4        5 
## -0.56820  1.04728 -0.00826 -0.22713 -0.24370 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    0.774      0.277     2.8    0.049 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.619 on 4 degrees of freedom
#Data reduction
#conditions, ignoring dogs
models.dog4 <- dogs |> group_by(condition) |> 
  do(model = lm(y ~ x, data = .)) |> 
  pull(model) |> purrr::map(broom::tidy)
models.dog5 <- models.dog4 |> purrr::map(\(table) table[, 1:3]) |> 
  map(~ pivot_wider(.x, names_from = 1, values_from = c(2, 3)))
names(models.dog5) <- 1:8
models.dog6 <- bind_rows(models.dog5, .id = "condition")
models.dog6["condition"] <- 1:8
models.dog6
## # A tibble: 8 × 5
##   condition `estimate_(Intercept)` estimate_x std.error_(Intercept…¹ std.error_x
##       <int>                  <dbl>      <dbl>                  <dbl>       <dbl>
## 1         1                  38.0       0.830                   10.1       0.135
## 2         2                  28.2       0.952                   21.8       0.278
## 3         3                   3.72      1.25                    42.6       0.583
## 4         4                  40.8       0.717                   32.0       0.507
## 5         5                  43.7       0.741                   24.7       0.327
## 6         6                  56.0       0.542                   15.7       0.246
## 7         7                  62.8       0.417                   31.3       0.453
## 8         8                  48.9       0.693                   19.1       0.242
## # ℹ abbreviated name: ¹​`std.error_(Intercept)`
#Data reduction
#conditions, ignoring dogs

#Intercept (s.e.)
summary(lm(`estimate_(Intercept)` ~ 1, data = models.dog6))
## 
## Call:
## lm(formula = `estimate_(Intercept)` ~ 1, data = models.dog6)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -36.54  -4.75   1.98  10.45  22.54 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    40.26       6.45    6.24  0.00043 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.2 on 7 degrees of freedom
#Slope (s.e.)
summary(lm(estimate_x ~ 1, data = models.dog6))
## 
## Call:
## lm(formula = estimate_x ~ 1, data = models.dog6)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3504 -0.1123 -0.0389  0.0924  0.4826 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.768      0.090    8.53    6e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.254 on 7 degrees of freedom
#Random effects
#dogs random, conditions fixed
library(nlme)
rndeff.dog<-lme(y~factor(condition)+x,
            data=dogs,
            random= ~ 1|dog)
summary(rndeff.dog)
## Linear mixed-effects model fit by REML
##   Data: dogs 
##   AIC BIC logLik
##   264 280   -121
## 
## Random effects:
##  Formula: ~1 | dog
##         (Intercept) Residual
## StdDev:     0.00311     8.45
## 
## Fixed effects:  y ~ factor(condition) + x 
##                    Value Std.Error DF t-value p-value
## (Intercept)         45.4      8.79 27    5.17   0.000
## factor(condition)2   0.1      5.36 27    0.01   0.991
## factor(condition)3  -3.8      5.34 27   -0.71   0.484
## factor(condition)4  -5.3      5.47 27   -0.97   0.339
## factor(condition)5  -0.8      5.34 27   -0.15   0.880
## factor(condition)6  -0.9      5.49 27   -0.17   0.869
## factor(condition)7  -3.8      5.37 27   -0.71   0.482
## factor(condition)8   0.8      5.36 27    0.15   0.882
## x                    0.7      0.11 27    6.75   0.000
##  Correlation: 
##                    (Intr) fct()2 fct()3 fct()4 fct()5 fct()6 fct()7 fct()8
## factor(condition)2 -0.232                                                 
## factor(condition)3 -0.321  0.497                                          
## factor(condition)4 -0.493  0.470  0.492                                   
## factor(condition)5 -0.300  0.499  0.500  0.487                            
## factor(condition)6 -0.503  0.467  0.491  0.525  0.486                     
## factor(condition)7 -0.398  0.487  0.499  0.508  0.497  0.508              
## factor(condition)8 -0.227  0.503  0.497  0.468  0.499  0.466  0.486       
## x                  -0.903 -0.078  0.019  0.217 -0.005  0.230  0.106 -0.084
## 
## Standardized Within-Group Residuals:
##     Min      Q1     Med      Q3     Max 
## -1.6785 -0.7484  0.0576  0.5904  1.5730 
## 
## Number of Observations: 40
## Number of Groups: 5
#Random effects
#conditions random, dogs fixed
rndeff.condition<-lme(y~factor(dog)+x,
            data=dogs,
            random= ~ 1|condition)
summary(rndeff.condition)
## Linear mixed-effects model fit by REML
##   Data: dogs 
##   AIC BIC logLik
##   272 284   -128
## 
## Random effects:
##  Formula: ~1 | condition
##         (Intercept) Residual
## StdDev:    0.000347     7.88
## 
## Fixed effects:  y ~ factor(dog) + x 
##              Value Std.Error DF t-value p-value
## (Intercept)   44.9      7.28 27    6.16  0.0000
## factor(dog)2   5.1      5.04 27    1.02  0.3183
## factor(dog)3   8.2      4.61 27    1.77  0.0875
## factor(dog)4   9.1      5.19 27    1.75  0.0916
## factor(dog)5   7.0      4.57 27    1.53  0.1388
## x              0.6      0.13 27    4.97  0.0000
##  Correlation: 
##              (Intr) fct()2 fct()3 fct()4 fct()5
## factor(dog)2  0.365                            
## factor(dog)3  0.249  0.658                     
## factor(dog)4  0.396  0.703  0.663              
## factor(dog)5  0.234  0.653  0.631  0.657       
## x            -0.924 -0.623 -0.520 -0.651 -0.506
## 
## Standardized Within-Group Residuals:
##    Min     Q1    Med     Q3    Max 
## -1.767 -0.620  0.202  0.571  1.625 
## 
## Number of Observations: 40
## Number of Groups: 8
#Random effects
#both random,
rndeff.both<-lme(y~x,
            data=dogs,
            random= ~ dog|condition)
summary(rndeff.both)
## Linear mixed-effects model fit by REML
##   Data: dogs 
##   AIC BIC logLik
##   290 299   -139
## 
## Random effects:
##  Formula: ~dog | condition
##  Structure: General positive-definite, Log-Cholesky parametrization
##             StdDev   Corr  
## (Intercept) 2.81e-04 (Intr)
## dog         7.32e-13 0     
## Residual    7.91e+00       
## 
## Fixed effects:  y ~ x 
##             Value Std.Error DF t-value p-value
## (Intercept)  40.8      6.62 31    6.16       0
## x             0.8      0.09 31    8.40       0
##  Correlation: 
##   (Intr)
## x -0.982
## 
## Standardized Within-Group Residuals:
##    Min     Q1    Med     Q3    Max 
## -2.009 -0.675  0.122  0.757  1.633 
## 
## Number of Observations: 40
## Number of Groups: 8
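Note that random = ~ dog|condition treats the numeric dog index as a covariate with a random slope rather than as a second grouping factor, and the near-zero estimated standard deviations suggest this random structure is not well identified. Genuinely crossed random effects are more naturally written in lme4; a minimal sketch, assuming lme4 is installed:

library(lme4)
#random intercepts for dogs and for conditions, crossed
rndeff.crossed <- lmer(y ~ x + (1 | dog) + (1 | condition), data = dogs)
summary(rndeff.crossed)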
#GEE: dog effects fixed, conditions as clusters
dogs$dog2 <- 1:40
gee.ind.dog <- geeglm(y ~ factor(dog) + x,
                      family = gaussian("identity"), 
                      data = dogs,
                      id = condition,
                      std.err = "san.se",
                      corstr = "independence")
summary(gee.ind.dog)
## 
## Call:
## geeglm(formula = y ~ factor(dog) + x, family = gaussian("identity"), 
##     data = dogs, id = condition, corstr = "independence", std.err = "san.se")
## 
##  Coefficients:
##              Estimate Std.err  Wald Pr(>|W|)    
## (Intercept)   44.8626  4.6430 93.36  < 2e-16 ***
## factor(dog)2   5.1232  4.7041  1.19    0.276    
## factor(dog)3   8.1755  3.5073  5.43    0.020 *  
## factor(dog)4   9.0770  4.0122  5.12    0.024 *  
## factor(dog)5   6.9657  3.5274  3.90    0.048 *  
## x              0.6288  0.0818 59.04  1.5e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)     52.8    8.86
## Number of clusters:   40  Maximum cluster size: 1
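The summary above reports 40 clusters of maximum size 1: geeglm requires rows belonging to the same cluster to be contiguous, and dogs is ordered by dog rather than by condition. A sketch of the intended fit (dogs_sorted and gee.ind.dog.sorted are illustrative names), sorting first so that the 8 conditions are recognized as clusters of 5 dogs:

dogs_sorted <- dogs |> arrange(condition, dog)
gee.ind.dog.sorted <- geeglm(y ~ factor(dog) + x,
                             family = gaussian("identity"),
                             data = dogs_sorted,
                             id = condition,
                             std.err = "san.se",
                             corstr = "independence")
summary(gee.ind.dog.sorted)

With an independence working correlation the point estimates are unchanged, but the sandwich standard errors now account for the clustering.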
gee.ind.condition <- geeglm(y ~ factor(condition) + x,
                            family = gaussian("identity"), 
                            data = dogs,
                            id = dog,
                            std.err = "san.se",
                            corstr = "independence")
summary(gee.ind.condition)
## 
## Call:
## geeglm(formula = y ~ factor(condition) + x, family = gaussian("identity"), 
##     data = dogs, id = dog, corstr = "independence", std.err = "san.se")
## 
##  Coefficients:
##                    Estimate Std.err  Wald Pr(>|W|)    
## (Intercept)         45.3929  6.7752 44.89  2.1e-11 ***
## factor(condition)2   0.0592  2.2242  0.00    0.979    
## factor(condition)3  -3.7953  3.9743  0.91    0.340    
## factor(condition)4  -5.3275  2.9544  3.25    0.071 .  
## factor(condition)5  -0.8148  4.2951  0.04    0.850    
## factor(condition)6  -0.9122  3.6664  0.06    0.804    
## factor(condition)7  -3.8340  4.3793  0.77    0.381    
## factor(condition)8   0.8007  3.9208  0.04    0.838    
## x                    0.7284  0.0870 70.06  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)     55.3    10.2
## Number of clusters:   5  Maximum cluster size: 8

Exercises 11.2

Since \(\mathbb E(\mathbf y)=\mathbf X\boldsymbol\beta\), the generalized least squares estimator \(\hat{\boldsymbol\beta}=(\mathbf{X}^T\mathbf{V}^{-1}\mathbf{X})^{-1}\mathbf{X}^T\mathbf{V}^{-1}\mathbf{y}\) is unbiased:

\[\mathbb E(\hat{\boldsymbol\beta})=\mathbb E\left[(\mathbf{X}^T\mathbf{V}^{-1}\mathbf{X})^{-1}\mathbf{X}^T\mathbf{V}^{-1}\mathbf{y}\right]\\ =(\mathbf{X}^T\mathbf{V}^{-1}\mathbf{X})^{-1}\mathbf{X}^T\mathbf{V}^{-1}\mathbb E(\mathbf{y})\\ =(\mathbf{X}^T\mathbf{V}^{-1}\mathbf{X})^{-1}\mathbf{X}^T\mathbf{V}^{-1}\mathbf{X}\boldsymbol\beta\\ =\boldsymbol\beta.\]
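The formula can also be checked numerically; with \(\mathbf V=\mathbf I\) the generalized least squares estimator reduces to ordinary least squares. A small sketch using the dogs data from Exercises 11.1:

X <- model.matrix(~ x, data = dogs)   # design matrix
V <- diag(nrow(dogs))                 # V = I, so GLS coincides with OLS
beta.gls <- solve(t(X) %*% solve(V) %*% X) %*% t(X) %*% solve(V) %*% dogs$y
cbind(gls = beta.gls, ols = coef(lm(y ~ x, data = dogs)))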

ear
## # A tibble: 18 × 4
##    age    treatment `number clear` frequency
##    <chr>  <chr>              <dbl>     <dbl>
##  1 < 2    CEF                    0         8
##  2 < 2    CEF                    1         2
##  3 < 2    CEF                    2         8
##  4 < 2    AMO                    0        11
##  5 < 2    AMO                    1         2
##  6 < 2    AMO                    2         2
##  7 2 to 5 CEF                    0         6
##  8 2 to 5 CEF                    1         6
##  9 2 to 5 CEF                    2        10
## 10 2 to 5 AMO                    0         3
## 11 2 to 5 AMO                    1         1
## 12 2 to 5 AMO                    2         5
## 13 >= 6   CEF                    0         0
## 14 >= 6   CEF                    1         1
## 15 >= 6   CEF                    2         3
## 16 >= 6   AMO                    0         1
## 17 >= 6   AMO                    1         0
## 18 >= 6   AMO                    2         6
ear2 <- ear |> group_by(age, treatment) |> mutate(ratio=frequency/sum(frequency)) |> ungroup()
ear2
## # A tibble: 18 × 5
##    age    treatment `number clear` frequency ratio
##    <chr>  <chr>              <dbl>     <dbl> <dbl>
##  1 < 2    CEF                    0         8 0.444
##  2 < 2    CEF                    1         2 0.111
##  3 < 2    CEF                    2         8 0.444
##  4 < 2    AMO                    0        11 0.733
##  5 < 2    AMO                    1         2 0.133
##  6 < 2    AMO                    2         2 0.133
##  7 2 to 5 CEF                    0         6 0.273
##  8 2 to 5 CEF                    1         6 0.273
##  9 2 to 5 CEF                    2        10 0.455
## 10 2 to 5 AMO                    0         3 0.333
## 11 2 to 5 AMO                    1         1 0.111
## 12 2 to 5 AMO                    2         5 0.556
## 13 >= 6   CEF                    0         0 0    
## 14 >= 6   CEF                    1         1 0.25 
## 15 >= 6   CEF                    2         3 0.75 
## 16 >= 6   AMO                    0         1 0.143
## 17 >= 6   AMO                    1         0 0    
## 18 >= 6   AMO                    2         6 0.857
#Pooled
Pooled.ear1 <- glm(`number clear` ~ factor(age)*factor(treatment),
                   weights = ratio,
                  data = ear2)
summary(Pooled.ear1)
## 
## Call:
## glm(formula = `number clear` ~ factor(age) * factor(treatment), 
##     data = ear2, weights = ratio)
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               0.400      0.601    0.67     0.52
## factor(age)>= 6                           1.314      0.849    1.55     0.15
## factor(age)2 to 5                         0.822      0.849    0.97     0.36
## factor(treatment)CEF                      0.600      0.849    0.71     0.50
## factor(age)>= 6:factor(treatment)CEF     -0.564      1.201   -0.47     0.65
## factor(age)2 to 5:factor(treatment)CEF   -0.640      1.201   -0.53     0.61
## 
## (Dispersion parameter for gaussian family taken to be 0.361)
## 
##     Null deviance: 4.8536  on 15  degrees of freedom
## Residual deviance: 3.6066  on 10  degrees of freedom
## AIC: Inf
## 
## Number of Fisher Scoring iterations: 2
ear2$age <- factor(ear2$age)
ear2$treatment <- factor(ear2$treatment)
ear2
## # A tibble: 18 × 5
##    age    treatment `number clear` frequency ratio
##    <fct>  <fct>              <dbl>     <dbl> <dbl>
##  1 < 2    CEF                    0         8 0.444
##  2 < 2    CEF                    1         2 0.111
##  3 < 2    CEF                    2         8 0.444
##  4 < 2    AMO                    0        11 0.733
##  5 < 2    AMO                    1         2 0.133
##  6 < 2    AMO                    2         2 0.133
##  7 2 to 5 CEF                    0         6 0.273
##  8 2 to 5 CEF                    1         6 0.273
##  9 2 to 5 CEF                    2        10 0.455
## 10 2 to 5 AMO                    0         3 0.333
## 11 2 to 5 AMO                    1         1 0.111
## 12 2 to 5 AMO                    2         5 0.556
## 13 >= 6   CEF                    0         0 0    
## 14 >= 6   CEF                    1         1 0.25 
## 15 >= 6   CEF                    2         3 0.75 
## 16 >= 6   AMO                    0         1 0.143
## 17 >= 6   AMO                    1         0 0    
## 18 >= 6   AMO                    2         6 0.857
models <- ear2 |> group_by(age) |> 
  do(model = lm(`number clear` ~ factor(treatment),
                   weights = ratio, data = .)) |> 
  pull(model) |> purrr::map(tidy)
models2 <- models |> purrr::map(\(table) table[,1:3]) |> 
  map(~ (pivot_wider(.x, names_from=1,values_from= c(2,3))))
names(models2) <- c("< 2",">6","2 to 5")
models3 <- bind_rows(models2, .id = "age")
models3
## # A tibble: 3 × 5
##   age    `estimate_(Intercept)` estimate_factor(treatme…¹ std.error_(Intercept…²
##   <chr>                   <dbl>                     <dbl>                  <dbl>
## 1 < 2                     0.400                    0.6                     0.591
## 2 >= 6                    1.71                     0.0357                  0.582
## 3 2 to 5                  1.22                    -0.0404                  0.619
## # ℹ abbreviated names: ¹​`estimate_factor(treatment)CEF`,
## #   ²​`std.error_(Intercept)`
## # ℹ 1 more variable: `std.error_factor(treatment)CEF` <dbl>
models <- ear2 |> group_by(treatment) |> 
  do(model = lm(`number clear` ~ factor(age),
                   weights = ratio, data = .)) |> 
  pull(model) |> purrr::map(tidy)
models2 <- models |> purrr::map(\(table) table[,1:3]) |> 
  map(~ (pivot_wider(.x, names_from=1,values_from= c(2,3))))
names(models2) <- c("AMO","CEF")
models3 <- bind_rows(models2, .id = "treatment")
models3
## # A tibble: 2 × 7
##   treatment `estimate_(Intercept)` estimate_factor(age)…¹ estimate_factor(age)…²
##   <chr>                      <dbl>                  <dbl>                  <dbl>
## 1 AMO                          0.4                   1.31                  0.822
## 2 CEF                          1                     0.75                  0.182
## # ℹ abbreviated names: ¹​`estimate_factor(age)>= 6`,
## #   ²​`estimate_factor(age)2 to 5`
## # ℹ 3 more variables: `std.error_(Intercept)` <dbl>,
## #   `std.error_factor(age)>= 6` <dbl>, `std.error_factor(age)2 to 5` <dbl>
#logistic model 
library(nnet)
res.ear=multinom(`number clear`~age+treatment, weights=frequency,data=ear2)
## # weights:  15 (8 variable)
## initial  value 82.395922 
## iter  10 value 68.432035
## final  value 68.431944 
## converged
summary(res.ear)
## Call:
## multinom(formula = `number clear` ~ age + treatment, data = ear2, 
##     weights = frequency)
## 
## Coefficients:
##   (Intercept) age>= 6 age2 to 5 treatmentCEF
## 1       -2.24    1.86      1.19        1.142
## 2       -1.08    3.05      1.07        0.788
## 
## Std. Errors:
##   (Intercept) age>= 6 age2 to 5 treatmentCEF
## 1       0.775    1.55     0.760        0.795
## 2       0.520    1.15     0.585        0.578
## 
## Residual Deviance: 137 
## AIC: 153
t(res.ear$fitted.values[seq(1,18, by=3),])
##       1      4     7    10     13     16
## 0 0.480 0.6908 0.234 0.428 0.0527 0.1127
## 1 0.161 0.0739 0.257 0.150 0.1140 0.0777
## 2 0.359 0.2353 0.509 0.423 0.8333 0.8096
res.ear$edf
## [1] 8
ear3 <- ear |> group_by(age, treatment) |> mutate(n=sum(frequency)) |> ungroup()
estimate <- matrix(t(res.ear$fitted.values[seq(1,18, by=3),]))*ear3$n
estimate
##         [,1]
##  [1,]  8.638
##  [2,]  2.892
##  [3,]  6.470
##  [4,] 10.363
##  [5,]  1.108
##  [6,]  3.530
##  [7,]  5.151
##  [8,]  5.652
##  [9,] 11.196
## [10,]  3.848
## [11,]  1.348
## [12,]  3.803
## [13,]  0.211
## [14,]  0.456
## [15,]  3.333
## [16,]  0.789
## [17,]  0.544
## [18,]  5.667
observe <- matrix(ear2$frequency)
observe
##       [,1]
##  [1,]    8
##  [2,]    2
##  [3,]    8
##  [4,]   11
##  [5,]    2
##  [6,]    2
##  [7,]    6
##  [8,]    6
##  [9,]   10
## [10,]    3
## [11,]    1
## [12,]    5
## [13,]    0
## [14,]    1
## [15,]    3
## [16,]    1
## [17,]    0
## [18,]    6
#Pearson chi-squared statistic: sum of (observed - expected)^2 / expected
sum((observe-estimate)^2/estimate)
## [1] 4.56
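
With six age-by-treatment groups contributing two free cell probabilities each and 8 fitted parameters, the statistic can be referred to a chi-squared distribution on 12 - 8 = 4 degrees of freedom (my reading of the setup):

#refer X^2 to chi-squared with 12 - 8 = 4 df
X2 <- sum((observe-estimate)^2/estimate)
pchisq(X2, df = 4, lower.tail = FALSE)   #~0.34, no evidence of lack of fit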

Chapter12 Bayesian Analysis

P_theta <- 0.0333 #prior probability of each theta value under H0 (see Table 12.1 below)
for(theta in c(seq(0,0.5, 0.1))){
  #P(y|theta)
  print(choose(10,7)*theta^7*(1-theta)^3)
}
## [1] 0
## [1] 8.75e-06
## [1] 0.000786
## [1] 0.009
## [1] 0.0425
## [1] 0.117
Likelihood <- function(theta){
  choose(10,7)*theta^7*(1-theta)^3
}
#Table 12.1
library(tidyverse)
tibble(theta = c(seq(0,1.0, 0.1)), 
       Hypothesis=c(rep('H0', 6),rep('H1', 5))) |> mutate(
          Prior = c(rep(0.0333, 6),rep(0.16,5)),
          Likelihood = choose(10,7)*theta^7*(1-theta)^3,
          LikelihoodXPrior= Prior*Likelihood,
          Posterior = LikelihoodXPrior/sum(LikelihoodXPrior)
)
## # A tibble: 11 × 6
##    theta Hypothesis  Prior Likelihood LikelihoodXPrior  Posterior
##    <dbl> <chr>       <dbl>      <dbl>            <dbl>      <dbl>
##  1   0   H0         0.0333 0               0           0         
##  2   0.1 H0         0.0333 0.00000875      0.000000291 0.00000235
##  3   0.2 H0         0.0333 0.000786        0.0000262   0.000211  
##  4   0.3 H0         0.0333 0.00900         0.000300    0.00241   
##  5   0.4 H0         0.0333 0.0425          0.00141     0.0114    
##  6   0.5 H0         0.0333 0.117           0.00390     0.0314    
##  7   0.6 H1         0.16   0.215           0.0344      0.277     
##  8   0.7 H1         0.16   0.267           0.0427      0.344     
##  9   0.8 H1         0.16   0.201           0.0322      0.260     
## 10   0.9 H1         0.16   0.0574          0.00918     0.0740    
## 11   1   H1         0.16   0               0           0
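
The posterior probability of \(H_1\) is then the total posterior mass on \(\theta \ge 0.6\) (a short follow-up reusing the table's construction):

#posterior probability of H1
tab <- tibble(theta = seq(0, 1.0, 0.1),
              Hypothesis = c(rep('H0', 6), rep('H1', 5))) |> mutate(
          Prior = c(rep(0.0333, 6), rep(0.16, 5)),
          Likelihood = choose(10,7)*theta^7*(1-theta)^3,
          Posterior = Prior*Likelihood/sum(Prior*Likelihood))
sum(tab$Posterior[tab$Hypothesis == 'H1'])   #~0.955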
#the LHR corresponding to an improvement in survival from 15% to 25%
log(log(0.15)/log(0.25))
## [1] 0.314
x = seq(-0.8, 1.2, 0.01)
y = dnorm(x, mean = 0, sd = 0.1907)
Likelihood = dnorm(x, mean = 0.58, sd = 0.2266)
h=function(x){dnorm(x, mean = 0, sd = 0.1907)*dnorm(x, mean = 0.58, sd = 0.2266)}
c = integrate(h,-0.8, 1.2)
Posterior = y*Likelihood/c$value
plot(x, y, type = "l", col = "gray",
     lty = 2, xlab="LHR", cex.lab=1.5, cex.main=1.5,
     ylim = c(0,3), xlim = c(-0.8,1.2),
     lwd=2)
text(-0.5, .6*max(y) , "Sceptical prior", cex=1.5 )
lines(x, Likelihood, type = "l", col = "gray",
     lty = 1,  cex.lab=1.5,
     cex.main=1.5, lwd=2)
text(1.0, .6*max(Likelihood) , "Likelihood", cex=1.5 )
lines(x, Posterior, type = "l", col = "skyblue",
     lty = 1,  cex.lab=1.5,
     cex.main=1.5, lwd=2)
text(0.3, max(Posterior) +0.2, "Posterior", col = "skyblue", 
     cex=1.5 )

x = seq(-0.8, 1.2, 0.01)
y = dunif(x, min = -0.6, max = 1.2)
Likelihood = dnorm(x, mean = 0.58, sd = 0.2266)
h=function(x){dunif(x, min = -0.6, max = 1.2)*dnorm(x, mean = 0.58, sd = 0.2266)}
c = integrate(h,-0.8, 1.2)
Posterior = y*Likelihood/c$value
plot(x, y, type = "l", col = "gray",
     lty = 2, xlab="LHR", cex.lab=1.5, cex.main=1.5,
     ylim = c(0,3), xlim = c(-0.8,1.2),
     lwd=2)
text(-0.5, .6*max(y) , "Uninformative prior", cex=1.5 )
lines(x, Likelihood, type = "l", col = "gray",
     lty = 1,  cex.lab=1.5,
     cex.main=1.5, lwd=2)
text(1.0, .6*max(Likelihood) , "Likelihood", cex=1.5 )
lines(x, Posterior, type = "l", col = "skyblue",
     lty = 1,  cex.lab=1.5,
     cex.main=1.5, lwd=2)
text(0.3, max(Posterior) +0.2, "Posterior", col = "skyblue", 
     cex=1.5 )

x = seq(-0.8, 1.2, 0.01)
y = dnorm(x, mean = 0, sd = 0.1907)
Likelihood = dnorm(x, mean = 0.58, sd = 0.2266)
h=function(x){dnorm(x, mean = 0, sd = 0.1907)*dnorm(x, mean = 0.58, sd = 0.2266)}
c = integrate(h,-0.8, 1.2)

Posterior_f = function(x){dnorm(x, mean = 0, sd = 0.1907)*dnorm(x, mean = 0.58, sd = 0.2266)/c$value}

#The mode (posterior maximum)
x[Posterior_f(x) == max(Posterior_f(x))]
## [1] 0.24
#The mean (by numerical integration)
sum(Posterior_f(x)*x*0.01)
## [1] 0.24
#The probability that the improvement is greater than 10% (corresponding to LHR = 0.3137)
x2 = seq(0.3137, 1.2, 0.01)
sum(Posterior_f(x2)*0.01)
## [1] 0.32
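
Because both the prior and the likelihood are normal, the posterior is normal in closed form, with precision equal to the sum of the two precisions; this cross-checks the numerical mean above (a minimal sketch, variable names mine):

#closed-form normal-normal update via precision weighting
prior_mean <- 0;    prior_sd <- 0.1907
lik_mean   <- 0.58; lik_sd   <- 0.2266
post_prec  <- 1/prior_sd^2 + 1/lik_sd^2
post_mean  <- (prior_mean/prior_sd^2 + lik_mean/lik_sd^2)/post_prec
post_sd    <- sqrt(1/post_prec)
c(post_mean, post_sd)   #~0.24 and ~0.146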
qbeta(0.025,shape1=1, shape2=92)
## [1] 0.000275
qbeta(0.975,shape1=1, shape2=92)
## [1] 0.0393
#Exercises 12.1
library(tidyverse)
data1 <- tibble(theta = c(seq(0,1.0, 0.1)), 
       Hypothesis=c(rep('H0', 6),rep('H1', 5))) |> mutate(
          Prior = c(rep(0.5/6, 6),rep(0.5/5,5)),
          Likelihood = choose(10,5)*theta^5*(1-theta)^5,
          LikelihoodXPrior= Prior*Likelihood,
          Posterior = LikelihoodXPrior/sum(LikelihoodXPrior)
)
data1
## # A tibble: 11 × 6
##    theta Hypothesis  Prior Likelihood LikelihoodXPrior Posterior
##    <dbl> <chr>       <dbl>      <dbl>            <dbl>     <dbl>
##  1   0   H0         0.0833    0               0          0      
##  2   0.1 H0         0.0833    0.00149         0.000124   0.00153
##  3   0.2 H0         0.0833    0.0264          0.00220    0.0271 
##  4   0.3 H0         0.0833    0.103           0.00858    0.106  
##  5   0.4 H0         0.0833    0.201           0.0167     0.206  
##  6   0.5 H0         0.0833    0.246           0.0205     0.252  
##  7   0.6 H1         0.1       0.201           0.0201     0.247  
##  8   0.7 H1         0.1       0.103           0.0103     0.127  
##  9   0.8 H1         0.1       0.0264          0.00264    0.0325 
## 10   0.9 H1         0.1       0.00149         0.000149   0.00183
## 11   1   H1         0.1       0               0          0
sum(data1$Posterior[data1$theta>0.5])
## [1] 0.408
data1 <- tibble(theta = c(seq(0,1.0, 0.1)), 
       Hypothesis=c(rep('H0', 6),rep('H1', 5))) |> mutate(
          Prior = c(rep(0.5/6, 6),rep(0.5/5,5)),
          Likelihood = choose(10,1)*theta^1*(1-theta)^9,
          LikelihoodXPrior= Prior*Likelihood,
          Posterior = LikelihoodXPrior/sum(LikelihoodXPrior)
)
data1
## # A tibble: 11 × 6
##    theta Hypothesis  Prior    Likelihood LikelihoodXPrior    Posterior
##    <dbl> <chr>       <dbl>         <dbl>            <dbl>        <dbl>
##  1   0   H0         0.0833 0                     0        0           
##  2   0.1 H0         0.0833 0.387                 3.23e- 2 0.467       
##  3   0.2 H0         0.0833 0.268                 2.24e- 2 0.324       
##  4   0.3 H0         0.0833 0.121                 1.01e- 2 0.146       
##  5   0.4 H0         0.0833 0.0403                3.36e- 3 0.0486      
##  6   0.5 H0         0.0833 0.00977               8.14e- 4 0.0118      
##  7   0.6 H1         0.1    0.00157               1.57e- 4 0.00228     
##  8   0.7 H1         0.1    0.000138              1.38e- 5 0.000199    
##  9   0.8 H1         0.1    0.00000410            4.10e- 7 0.00000593  
## 10   0.9 H1         0.1    0.00000000900         9.00e-10 0.0000000130
## 11   1   H1         0.1    0                     0        0
sum(data1$Posterior[data1$theta>0.5])
## [1] 0.00248
data1 <- tibble(theta = c(seq(0,1.0, 0.1)), 
       Hypothesis=c(rep('H0', 6),rep('H1', 5))) |> mutate(
          Prior = c(rep(0.01/6, 6),rep(0.99/5,5)),
          Likelihood = choose(10,5)*theta^5*(1-theta)^5,
          LikelihoodXPrior= Prior*Likelihood,
          Posterior = LikelihoodXPrior/sum(LikelihoodXPrior)
)
data1
## # A tibble: 11 × 6
##    theta Hypothesis   Prior Likelihood LikelihoodXPrior Posterior
##    <dbl> <chr>        <dbl>      <dbl>            <dbl>     <dbl>
##  1   0   H0         0.00167    0             0          0        
##  2   0.1 H0         0.00167    0.00149       0.00000248 0.0000372
##  3   0.2 H0         0.00167    0.0264        0.0000440  0.000661 
##  4   0.3 H0         0.00167    0.103         0.000172   0.00258  
##  5   0.4 H0         0.00167    0.201         0.000334   0.00502  
##  6   0.5 H0         0.00167    0.246         0.000410   0.00616  
##  7   0.6 H1         0.198      0.201         0.0397     0.597    
##  8   0.7 H1         0.198      0.103         0.0204     0.306    
##  9   0.8 H1         0.198      0.0264        0.00523    0.0786   
## 10   0.9 H1         0.198      0.00149       0.000295   0.00442  
## 11   1   H1         0.198      0             0          0
sum(data1$Posterior[data1$theta>0.5])
## [1] 0.986
data1 <- tibble(theta = c(seq(0,1.0, 0.1)), 
       Hypothesis=c(rep('H0', 6),rep('H1', 5))) |> mutate(
          Prior = c(rep(0.01/6, 6),rep(0.99/5,5)),
          Likelihood = choose(10,1)*theta^1*(1-theta)^9,
          LikelihoodXPrior= Prior*Likelihood,
          Posterior = LikelihoodXPrior/sum(LikelihoodXPrior)
)
data1
## # A tibble: 11 × 6
##    theta Hypothesis   Prior    Likelihood LikelihoodXPrior  Posterior
##    <dbl> <chr>        <dbl>         <dbl>            <dbl>      <dbl>
##  1   0   H0         0.00167 0                0             0         
##  2   0.1 H0         0.00167 0.387            0.000646      0.376     
##  3   0.2 H0         0.00167 0.268            0.000447      0.260     
##  4   0.3 H0         0.00167 0.121            0.000202      0.117     
##  5   0.4 H0         0.00167 0.0403           0.0000672     0.0391    
##  6   0.5 H0         0.00167 0.00977          0.0000163     0.00947   
##  7   0.6 H1         0.198   0.00157          0.000311      0.181     
##  8   0.7 H1         0.198   0.000138         0.0000273     0.0159    
##  9   0.8 H1         0.198   0.00000410       0.000000811   0.000472  
## 10   0.9 H1         0.198   0.00000000900    0.00000000178 0.00000104
## 11   1   H1         0.198   0                0             0
sum(data1$Posterior[data1$theta>0.5])
## [1] 0.198
library(Bolstad)
theta.space<-(0:10)/10
theta.mass<-c(rep(0.5/6,6),rep(0.5/5,5))
post<-binodp(5,10,pi=theta.space,pi.prior=theta.mass,ret=TRUE)

## Conditional distribution of x given pi and  n:
## 
##          0      1      2      3      4      5      6      7      8      9
## 0   1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 0.1 0.3487 0.3874 0.1937 0.0574 0.0112 0.0015 0.0001 0.0000 0.0000 0.0000
## 0.2 0.1074 0.2684 0.3020 0.2013 0.0881 0.0264 0.0055 0.0008 0.0001 0.0000
## 0.3 0.0282 0.1211 0.2335 0.2668 0.2001 0.1029 0.0368 0.0090 0.0014 0.0001
## 0.4 0.0060 0.0403 0.1209 0.2150 0.2508 0.2007 0.1115 0.0425 0.0106 0.0016
## 0.5 0.0010 0.0098 0.0439 0.1172 0.2051 0.2461 0.2051 0.1172 0.0439 0.0098
## 0.6 0.0001 0.0016 0.0106 0.0425 0.1115 0.2007 0.2508 0.2150 0.1209 0.0403
## 0.7 0.0000 0.0001 0.0014 0.0090 0.0368 0.1029 0.2001 0.2668 0.2335 0.1211
## 0.8 0.0000 0.0000 0.0001 0.0008 0.0055 0.0264 0.0881 0.2013 0.3020 0.2684
## 0.9 0.0000 0.0000 0.0000 0.0000 0.0001 0.0015 0.0112 0.0574 0.1937 0.3874
## 1   0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
##         10
## 0   0.0000
## 0.1 0.0000
## 0.2 0.0000
## 0.3 0.0000
## 0.4 0.0001
## 0.5 0.0010
## 0.6 0.0060
## 0.7 0.0282
## 0.8 0.1074
## 0.9 0.3487
## 1   1.0000
## 
## Joint distribution:
## 
##            0      1      2      3      4      5      6      7      8      9
##  [1,] 0.0833 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
##  [2,] 0.0291 0.0323 0.0161 0.0048 0.0009 0.0001 0.0000 0.0000 0.0000 0.0000
##  [3,] 0.0089 0.0224 0.0252 0.0168 0.0073 0.0022 0.0005 0.0001 0.0000 0.0000
##  [4,] 0.0024 0.0101 0.0195 0.0222 0.0167 0.0086 0.0031 0.0008 0.0001 0.0000
##  [5,] 0.0005 0.0034 0.0101 0.0179 0.0209 0.0167 0.0093 0.0035 0.0009 0.0001
##  [6,] 0.0001 0.0008 0.0037 0.0098 0.0171 0.0205 0.0171 0.0098 0.0037 0.0008
##  [7,] 0.0000 0.0002 0.0011 0.0042 0.0111 0.0201 0.0251 0.0215 0.0121 0.0040
##  [8,] 0.0000 0.0000 0.0001 0.0009 0.0037 0.0103 0.0200 0.0267 0.0233 0.0121
##  [9,] 0.0000 0.0000 0.0000 0.0001 0.0006 0.0026 0.0088 0.0201 0.0302 0.0268
## [10,] 0.0000 0.0000 0.0000 0.0000 0.0000 0.0001 0.0011 0.0057 0.0194 0.0387
## [11,] 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
##           10
##  [1,] 0.0000
##  [2,] 0.0000
##  [3,] 0.0000
##  [4,] 0.0000
##  [5,] 0.0000
##  [6,] 0.0001
##  [7,] 0.0006
##  [8,] 0.0028
##  [9,] 0.0107
## [10,] 0.0349
## [11,] 0.1000
## 
## Marginal distribution of x:
## 
##          0      1      2      3      4      5      6      7      8      9    10
## [1,] 0.124 0.0691 0.0757 0.0767 0.0783 0.0813 0.0849 0.0882 0.0897 0.0827 0.149
## 
## 
##      Prior Likelihood Posterior
## 0   0.0833    0.00000   0.00000
## 0.1 0.0833    0.00149   0.00153
## 0.2 0.0833    0.02642   0.02709
## 0.3 0.0833    0.10292   0.10552
## 0.4 0.0833    0.20066   0.20572
## 0.5 0.0833    0.24609   0.25231
## 0.6 0.1000    0.20066   0.24687
## 0.7 0.1000    0.10292   0.12662
## 0.8 0.1000    0.02642   0.03251
## 0.9 0.1000    0.00149   0.00183
## 1   0.1000    0.00000   0.00000
sum(post$posterior[7:11])
## [1] 0.408
#Exercises 12.2
mean <- 0.3137
sigma <- 0.1907
prior_f <- function(theta) {1/(sigma*sqrt(2*base::pi))*exp(-(theta-mean)^2/(2*sigma^2))}
x = seq(-0.8, 1.2, 0.01)
#prior probability that the new treatment is effective
integrate(prior_f,0, 1.2)
## 0.95 with absolute error < 1.5e-06
#posterior probability that the new treatment is effective
sigma2 <- 0.2266
mean2 <- 0.58
L_Prior <- function(theta) {1/(sigma2*sqrt(2*base::pi))*exp(-(theta-mean2)^2/(2*sigma2^2))*1/(sigma*sqrt(2*base::pi))*exp(-(theta-mean)^2/(2*sigma^2))}
c <- integrate(L_Prior, -1.2, 1.2)$value
posterior_f <- function(theta) {1/(sigma2*sqrt(2*base::pi))*exp(-(theta-mean2)^2/(2*sigma2^2))*1/(sigma*sqrt(2*base::pi))*exp(-(theta-mean)^2/(2*sigma^2))/c}
integrate(posterior_f, 0, 1.5)
## 0.998 with absolute error < 1.1e-09
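
The same calculation can be written more compactly with dnorm in place of the hand-coded normal densities (a sketch; the numbers are unchanged):

#identical computation using dnorm directly
prior_ok <- integrate(\(t) dnorm(t, mean, sigma), 0, 1.2)$value       #~0.95
post_num <- \(t) dnorm(t, mean2, sigma2)*dnorm(t, mean, sigma)
c2 <- integrate(post_num, -1.2, 1.2)$value
integrate(\(t) post_num(t)/c2, 0, 1.5)$value                          #~0.998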

Chapter13 Markov Chain Monte Carlo Methods

N=10000
x <- rnorm(N,mean=2,sd=1)+rnorm(N,mean=5,sd=0.5)
Fn <- ecdf(x)
Fn(4)
## [1] 0.0036
plot(Fn, verticals = TRUE, do.points = FALSE)

#numerically invert a monotone function f by root-finding with uniroot
inverse = function (f, lower = -100, upper = 100) {
   function (y) uniroot((function (x) f(x) - y), lower = lower, upper = upper)[[1]]
}

ecdf_inverse = inverse(function (x) Fn(x), 0.1, 100)

ecdf_inverse(0.5)
## [1] 7
u <- runif(N)
thetas <- sapply(u, ecdf_inverse)
hist(thetas)
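
Note that rnorm(N, mean=2, sd=1) + rnorm(N, mean=5, sd=0.5) above draws from the sum of the two normals, i.e. a single N(7, sqrt(1.25)) distribution, which is why the inverse CDF at 0.5 comes out near 7. Drawing from the 50:50 mixture used in the plots below needs a latent component indicator instead (a minimal sketch):

#draw from the two-component mixture via a component indicator
z <- rbinom(N, 1, 0.5)
mix <- ifelse(z == 1, rnorm(N, mean = 2, sd = 1), rnorm(N, mean = 5, sd = 0.5))
hist(mix, breaks = 50)   #bimodal, with mean (2+5)/2 = 3.5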

library(tidyverse)
theta <- seq(-1,7,0.01)
d_theta = 0.5*dnorm(theta,mean=2,sd=1)+0.5*dnorm(theta,mean=5,sd=0.5)

curve(0.5*dnorm(x,mean=2,sd=1)+0.5*dnorm(x,mean=5,sd=0.5), -1, 7, col="sienna", 
    lwd=2,n=1001, ylab="PDF", xlab="theta")
data = tibble('x'=theta, 'y'=d_theta)

data2 <- data[sample(seq_len(nrow(data)), 50, prob=data$y),]
segments(x0 = data2$x,
         y0 = 0,
         x1 = data2$x,
         y1 = data2$y,
         col= 'black',
         lwd = 1)

theta <- seq(-1,7,0.01)
d_theta = 0.5*dnorm(theta,mean=2,sd=1)+0.5*dnorm(theta,mean=5,sd=0.5)

curve(0.5*dnorm(x,mean=2,sd=1)+0.5*dnorm(x,mean=5,sd=0.5), -1, 7, col="sienna", 
    lwd=2,n=1001, ylab="PDF", xlab="theta") # plots the results

data = tibble('x'=theta, 'y'=d_theta)
data2 <- data[sample(seq_len(nrow(data)), 500, prob=data$y),]

segments(x0 = data2$x,
         y0 = 0,
         x1 = data2$x,
         y1 = data2$y,
         col= 'black',
         lwd = 0.1)

data2 <- data[sample(seq_len(nrow(data)), 50, prob=data$y),]
hist(data2$x,
     breaks = seq(from=-1, to=7, by=0.5),
        main="",
        col="white",
        xlab="theta",
     ylab="Counts")
axis(side=1, at=seq(from=-1, to=7, by=1))

data2 <- data[sample(seq_len(nrow(data)), 500, prob=data$y),]
hist(data2$x,
     breaks = seq(from=-1, to=7, by=0.2),
        main="",
        col="white",
        xlab="theta",
     ylab="Counts")
axis(side=1, at=seq(from=-1, to=7, by=1))

data2 <- data[sample(seq_len(nrow(data)), 500, prob=data$y),]
#sample mean: the rows were drawn with probability proportional to the density,
#so the plain average is the estimate (weighting by y again would double-count)
mean(data2$x)
## [1] 3.61
#weighted mean over the full grid, for comparison
weighted.mean(data$x,data$y)
## [1] 3.5
#True mean
c <- integrate(\(x) (0.5*dnorm(x,mean=2,sd=1)+0.5*dnorm(x,mean=5,sd=0.5)), -10, 10)
density_x <- function(x){
  (0.5*dnorm(x,mean=2,sd=1)+0.5*dnorm(x,mean=5,sd=0.5))*x/c$value
}
integrate(density_x, -10, 10)$value
## [1] 3.5
#True variance
density_x_x <- function(x){
  (0.5*dnorm(x,mean=2,sd=1)+0.5*dnorm(x,mean=5,sd=0.5))*x*x/c$value
}
integrate(density_x_x, -10, 10)$value - (integrate(density_x, -10, 10)$value)^2
## [1] 2.87
#closed-form mixture mean
(2+5)*0.5
## [1] 3.5
#closed-form mixture variance: sum of weight*(sigma^2 + (mu - overall mean)^2)
(1^2+(2-3.5)^2)*0.5+(0.5^2+(5-3.5)^2)*0.5
## [1] 2.88
#sample median
data2 <- data[sample(seq_len(nrow(data)), 500, prob=data$y),]
quantile(data2$x, c(0.5))
## 50% 
##   3
#True median (the mixture CDF equals 0.5 exactly at theta = 4)
for (i in 1:7) {
  if(abs(integrate(\(x) (0.5*dnorm(x,mean=2,sd=1)+0.5*dnorm(x,mean=5,sd=0.5)), -10, i)$value - 0.5)<0.001){
    print(i)
  }
}
## [1] 4
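
The loop above only checks integer values of theta; the median can also be found by root-finding on the mixture CDF directly (a cross-check, using the same mixture):

#solve F(theta) = 0.5; equals 4 exactly, by the symmetry of the construction
F_mix <- \(t) 0.5*pnorm(t, mean = 2, sd = 1) + 0.5*pnorm(t, mean = 5, sd = 0.5)
uniroot(\(t) F_mix(t) - 0.5, c(-1, 7))$root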
y=c(6,13,18,28,52,53,61,60)
n=c(59,60,62,56,63,59,62,60)
x=c(1.6907,1.7242,1.7552,1.7842,1.8113,1.8369,1.8610,1.8839)
x_mean = x-mean(x)
n_y=n-y
beetle.mat=cbind(y,n_y)
beetle.mat
##       y n_y
## [1,]  6  53
## [2,] 13  47
## [3,] 18  44
## [4,] 28  28
## [5,] 52  11
## [6,] 53   6
## [7,] 61   1
## [8,] 60   0
beetle
## # A tibble: 8 × 3
##       x     n     y
##   <dbl> <dbl> <dbl>
## 1  1.69    59     6
## 2  1.72    60    13
## 3  1.76    62    18
## 4  1.78    56    28
## 5  1.81    63    52
## 6  1.84    59    53
## 7  1.86    62    61
## 8  1.88    60    60
res.glm1=glm(beetle.mat~x_mean, family=binomial(link="logit"))
res.glm1
## 
## Call:  glm(formula = beetle.mat ~ x_mean, family = binomial(link = "logit"))
## 
## Coefficients:
## (Intercept)       x_mean  
##       0.744       34.270  
## 
## Degrees of Freedom: 7 Total (i.e. Null);  6 Residual
## Null Deviance:	    284 
## Residual Deviance: 11.2 	AIC: 41.4
logLik(res.glm1)
## 'log Lik.' -18.7 (df=2)
#Extreme value model 
res.glm3=glm(beetle.mat~x_mean, family=binomial(link="cloglog"))
res.glm3
## 
## Call:  glm(formula = beetle.mat ~ x_mean, family = binomial(link = "cloglog"))
## 
## Coefficients:
## (Intercept)       x_mean  
##     -0.0431      22.0412  
## 
## Degrees of Freedom: 7 Total (i.e. Null);  6 Residual
## Null Deviance:	    284 
## Residual Deviance: 3.45 	AIC: 33.6
logLik(res.glm3)
## 'log Lik.' -14.8 (df=2)
beta_1 = 0.7438
beta_2 = seq(20,40, 0.2)
mean_x = mean(x)
#logistic model: pi = exp(eta)/(1+exp(eta)) with eta = beta_1 + beta_2*(x - mean_x),
#so pi/(1-pi) = exp(eta) and log(pi/(1-pi)) = beta_1 + beta_2*(x - mean_x)
loglh <- function(beta2){
  sum(y*(beta_1+beta2*x_mean)+n*log(1/(1+exp(beta_1+beta2*x_mean)))+log(choose(n, y)))
}

lh_beta2 = sapply(beta_2, loglh)
lh_beta2
##   [1] -36.2 -35.7 -35.1 -34.6 -34.1 -33.6 -33.1 -32.6 -32.1 -31.6 -31.2 -30.7
##  [13] -30.3 -29.8 -29.4 -29.0 -28.6 -28.2 -27.8 -27.5 -27.1 -26.8 -26.4 -26.1
##  [25] -25.7 -25.4 -25.1 -24.8 -24.5 -24.2 -24.0 -23.7 -23.4 -23.2 -22.9 -22.7
##  [37] -22.5 -22.2 -22.0 -21.8 -21.6 -21.4 -21.2 -21.1 -20.9 -20.7 -20.6 -20.4
##  [49] -20.3 -20.1 -20.0 -19.9 -19.8 -19.7 -19.6 -19.5 -19.4 -19.3 -19.2 -19.1
##  [61] -19.1 -19.0 -19.0 -18.9 -18.9 -18.8 -18.8 -18.8 -18.7 -18.7 -18.7 -18.7
##  [73] -18.7 -18.7 -18.7 -18.7 -18.8 -18.8 -18.8 -18.9 -18.9 -19.0 -19.0 -19.1
##  [85] -19.1 -19.2 -19.2 -19.3 -19.4 -19.5 -19.6 -19.7 -19.8 -19.9 -20.0 -20.1
##  [97] -20.2 -20.3 -20.4 -20.5 -20.7
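
The grid point with the largest log-likelihood should sit near the glm estimate of 34.27 (a quick check):

#grid maximizer of the log-likelihood over beta_2
beta_2[which.max(lh_beta2)]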
beta_1 = 0.7438
y=c(6,13,18,28,52,53,61,60)
n=c(59,60,62,56,63,59,62,60)
x=c(1.6907,1.7242,1.7552,1.7842,1.8113,1.8369,1.8610,1.8839)
x_mean = x-mean(x)
n_y=n-y
lh <- function(beta_2){
  prod(exp(y*(beta_1+beta_2*x_mean)+n*log(1/(1+exp(beta_1+beta_2*x_mean)))+log(choose(n, y))))
}


beta2 <- c()
beta2[1] <- 1
#random-walk Metropolis: the N(0,1) proposal is symmetric, so the
#acceptance probability reduces to the likelihood ratio
for (i in 1:1000) {
  beta2_star <- beta2[i]+rnorm(1)                     #propose a move
  alpha <- min((lh(beta2_star)/lh(beta2[i])),1)       #acceptance probability
  u <- runif(1,min = 0, max = 1)
  beta2[i+1] <- ifelse(alpha>u, beta2_star, beta2[i]) #accept, otherwise stay
}
plot(beta2)
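
A usual next step (a sketch; the burn-in length of 200 is an arbitrary choice for this short chain) is to drop the early iterations and summarize the retained draws:

#discard a burn-in and summarize the chain
keep <- beta2[-(1:200)]
mean(keep)
quantile(keep, c(0.025, 0.5, 0.975))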

Chapter14 Example Bayesian Analyses

#Extreme value model 
res.glm3=glm(beetle.mat~x, family=binomial(link="cloglog"))
res.glm3
## 
## Call:  glm(formula = beetle.mat ~ x, family = binomial(link = "cloglog"))
## 
## Coefficients:
## (Intercept)            x  
##       -39.6         22.0  
## 
## Degrees of Freedom: 7 Total (i.e. Null);  6 Residual
## Null Deviance:	    284 
## Residual Deviance: 3.45 	AIC: 33.6
logLik(res.glm3)
## 'log Lik.' -14.8 (df=2)
res.glm3$deviance
## [1] 3.45
#probit model 
res.glm2=glm(beetle.mat~x, family=binomial(link="probit"))
res.glm2
## 
## Call:  glm(formula = beetle.mat ~ x, family = binomial(link = "probit"))
## 
## Coefficients:
## (Intercept)            x  
##       -34.9         19.7  
## 
## Degrees of Freedom: 7 Total (i.e. Null);  6 Residual
## Null Deviance:	    284 
## Residual Deviance: 10.1 	AIC: 40.3
logLik(res.glm2)
## 'log Lik.' -18.2 (df=2)
res.glm2$deviance
## [1] 10.1
# quantile residuals for the logistic fit, via statmod::qresid
library(statmod)
qresid(res.glm1)
## [1]  1.175  1.209 -1.112 -1.480  0.500 -0.362  1.047  2.006

Let \(y_i\) be the proportion of successes and \(\hat\mu_i\) the fitted value from the model:

Response: \[y_i - \hat\mu_i\] Response residuals are inadequate for assessing a fitted GLM, because GLMs are based on distributions where (in general) the variance depends on the mean.

Pearson:

The most direct way to handle the non-constant variance is to divide it out: \[ \frac{y_i - \hat\mu_i}{\sqrt{V(\hat\mu_i)}}\] where \(V(\cdot)\) is the (GLM) variance function, defined through \(Var(y_i) = a(\phi)V(\mu_i)\).

Under “Small dispersion asymptotics” conditions, the Pearson residuals have an approximate normal distribution.

Deviance: \[sign(y_i-\hat\mu_i)\sqrt{d_i}\] where \(d_i\) is the unit deviance, i.e. \[d_i = 2(t(y_i,y_i)-t(y_i,\hat\mu_i))\] For a Poisson model the deviance residual is \[sign(o_i-e_i)\sqrt{d_i}=sign(o_i-e_i)\sqrt{2[o_i\log(o_i/e_i)-(o_i-e_i)]}, \quad i=1,\dots,N\]

The deviance statistic (sum of squared unit-deviances) has an approximate chi-square distribution (when the saddlepoint approximation applies and under “Small dispersion asymptotics” conditions). Under these same conditions, the deviance residuals have an approximate normal distribution.

Working: \[z_i - \eta_i \] where \(z_i\) are the working responses \(\eta_i + \frac{d\eta_i}{d\mu_i}(y_i-\hat\mu_i)\) and \(\eta_i\) is the linear predictor, so the working residual is simply \(\frac{d\eta_i}{d\mu_i}(y_i-\hat\mu_i)\).

The model coefficients are fitted with the Fisher scoring algorithm, also known as iteratively reweighted least squares (IRLS). [It can be shown][1] that each iteration of this algorithm is equivalent to running ordinary least squares on the working responses defined here.

To check the link function, plot the working responses against the linear predictor: the relationship should be approximately linear if the right link function was used.
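
For example (a minimal sketch with simulated Poisson data, not taken from the text):

#link check: working responses z against the linear predictor eta
set.seed(2)
x_sim <- runif(200, 0, 2)
y_sim <- rpois(200, exp(0.5 + 1.2*x_sim))        #data generated with a log link
fit_sim <- glm(y_sim ~ x_sim, family = poisson("log"))
eta <- predict(fit_sim, type = "link")
z <- eta + resid(fit_sim, type = "working")      #z_i = eta_i + (deta/dmu)(y_i - mu_i)
plot(eta, z)                                     #roughly linear when the link is right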

Partial: \[z_i - \eta_i + X^*\beta\]

where \(X^*\) is the centered \(X\). Partial residuals can be used to determine if a covariate/predictor is on an inappropriate scale.

Quantile: \[\Phi^{-1}(F(y_i))\]

Where \(F\) is the fitted CDF evaluated at \(y_i\), and \(\Phi^{-1}\) is the quantile function (inverse CDF) of the standard normal. For discrete \(y_i\) one draws \(u \sim Unif(F(y_i-1), F(y_i))\) and uses \(\Phi^{-1}(u)\), which randomizes the residual over the probability mass at \(y_i\).

Here is an example code to calculate these residuals:

Y = c(0,0,0,0,1,1,1,1,1)
x1 = c(1,2,3,1,2,2,3,3,3)
x2 = c(1,0,0,1,0,0,0,0,0)
    
fit = glm(Y ~ x1 + x2, family = 'binomial')
    
lp = predict(fit)
mu = exp(lp)/(1+exp(lp))
    
# manually calculating the 1st response residual
resid(fit, type="response")[1]
##         1 
## -8.65e-09
Y[1] - mu[1]
##         1 
## -8.65e-09
# manually calculating the 1st pearson residual
resid(fit, type="pearson")[1]
##        1 
## -9.3e-05
(Y[1]-mu[1]) / sqrt(mu[1]*(1-mu[1]))
##        1 
## -9.3e-05
# manually calculating the 1st deviance residual
resid(fit, type="deviance")[1]
##         1 
## -0.000132
sqrt(-2*log(1-mu[1]))*sign(Y[1]-mu[1])
##         1 
## -0.000132
# manually calculating the 1st working residual
resid(fit, type="working")[1]
##  1 
## -1
(Y[1]-mu[1]) / (mu[1]*(1-mu[1]))
##  1 
## -1
# manually calculating the 1st partial residual
resid(fit, type="partial")[1,1]
## [1] -1.5
(Y[1]-mu[1]) / (mu[1]*(1-mu[1])) + fit$coefficients[2]*(x1[1] - mean(x1))
##    1 
## -1.5
resid(fit, type="partial")[1,2]
## [1] -15.7
(Y[1]-mu[1]) / (mu[1]*(1-mu[1])) + fit$coefficients[3]*(x2[1] - mean(x2))
##     1 
## -15.7
# manually calculating the 1st quantile residual
library(statmod)
qresid(fit)[1] # randomized (uniformly), so results won't be identical across runs
## [1] 1.25
a = pbinom(Y[1]-1, 1, mu[1]) 
b = pbinom(Y[1], 1, mu[1])
qnorm(runif(1, a, b)) # also randomized, so it won't match the value above exactly
## [1] 0.117
n = 10000
mean(replicate(n, qresid(fit)[1]))
## [1] 0.00822
mean(qnorm(runif(1000, a, b))) # should be close
## [1] 0.00218

reference:

[1]: https://stats.stackexchange.com/questions/1432/what-do-the-residuals-in-a-logistic-regression-mean/485734#485734