134 min read

Modern Statistics for Modern Biology-1

1 Generative Models for Discrete Data

## -----------------------------------------------------------------------------
dpois(x = 3, lambda = 5)
## [1] 0.1403739
## -----------------------------------------------------------------------------
.oldopt = options(digits = 2)
0:12
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12
dpois(x = 0:12, lambda = 5)
##  [1] 0.0067 0.0337 0.0842 0.1404 0.1755 0.1755 0.1462 0.1044 0.0653 0.0363
## [11] 0.0181 0.0082 0.0034
barplot(dpois(0:12, 5), names.arg = 0:12, col = "red")

options(.oldopt)
## -----------------------------------------------------------------------------
genotype = c("AA","AO","BB","AO","OO","AO","AA","BO","BO",
             "AO","BB","AO","BO","AB","OO","AB","BB","AO","AO")
table(genotype)
## genotype
## AA AB AO BB BO OO 
##  2  2  7  3  3  2
## -----------------------------------------------------------------------------
genotypeF = factor(genotype)
levels(genotypeF)
## [1] "AA" "AB" "AO" "BB" "BO" "OO"
table(genotypeF)
## genotypeF
## AA AB AO BB BO OO 
##  2  2  7  3  3  2
## -----------------------------------------------------------------------------
rbinom(15, prob = 0.5, size = 1)
##  [1] 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0
## -----------------------------------------------------------------------------
rbinom(12, prob = 2/3, size = 1)
##  [1] 1 1 1 0 0 0 1 1 0 0 1 1
## -----------------------------------------------------------------------------
rbinom(1, prob = 2/3, size = 12)
## [1] 9
## -----------------------------------------------------------------------------
set.seed(235569515)
rbinom(1, prob = 0.3, size = 15)
## [1] 5
## -----------------------------------------------------------------------------
probabilities = dbinom(0:15, prob = 0.3, size = 15)
round(probabilities, 2)
##  [1] 0.00 0.03 0.09 0.17 0.22 0.21 0.15 0.08 0.03 0.01 0.00 0.00 0.00 0.00 0.00
## [16] 0.00
## -----------------------------------------------------------------------------
barplot(probabilities, names.arg = 0:15, col = "red")

## -----------------------------------------------------------------------------
plot(dbinom(0:12, prob = 5e-4, size = 1e4), dpois(0:12, lambda = 5), asp = 1)
abline(a = 0, b = 1, col = "blue")

## -----------------------------------------------------------------------------
5^3 * exp(-5) / factorial(3)
## [1] 0.1403739
## -----------------------------------------------------------------------------
rbinom(1, prob = 5e-4, size = 10000)
## [1] 6
simulations = rbinom(n = 300000, prob = 5e-4, size = 10000)
barplot(table(simulations), col = "lavender")

## -----------------------------------------------------------------------------
`[<-`(rep(0, 100), 22, 1)
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## -----------------------------------------------------------------------------
s100 = rpois(100, lambda=0.5)
s100
##   [1] 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 2 0 0
##  [38] 0 1 2 0 1 0 1 2 0 2 0 1 2 1 0 1 0 0 0 0 0 1 0 0 1 2 0 0 0 1 0 1 2 0 0 0 0
##  [75] 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 4 3 1
barplot(s100, ylim = c(0, 7), width = 0.7, xlim = c(-0.5,100.5),
  names.arg = seq(along = s100), col="lavender")

## -----------------------------------------------------------------------------
## set.seed(8969311)
## e100 = rpois(100,lambda = 0.5)
## e100[42] = 7
## save(e100, file = "../data/e100.RData")

## -----------------------------------------------------------------------------
setwd("D:/books/R/blogdown/hugo")
load("../data/e100.RData")
barplot(e100, ylim = c(0, 7), width = 0.7, xlim = c(-0.5, 100.5),
  names.arg = seq(along = e100), col = "darkolivegreen")

## -----------------------------------------------------------------------------
barplot(e100, ylim = c(0, 7), width = 0.7, xlim = c(-0.5, 100.5),
  names.arg = seq(along = e100), col = "darkolivegreen")
text(35, 7, adj = c(-0.05, 0.5), labels = "?", xpd = NA, col = "red",
  cex = 1.25, font = 2)

## -----------------------------------------------------------------------------
1 - ppois(6, 0.5)
## [1] 1.00238e-06
ppois(6, 0.5, lower.tail = FALSE)
## [1] 1.00238e-06
## -----------------------------------------------------------------------------
maxes = replicate(100000, {
  max(rpois(100, 0.5))
})
table(maxes)
## maxes
##     1     2     3     4     5     6     7     9 
##     7 23028 60840 14364  1604   141    15     1
## -----------------------------------------------------------------------------
mean( maxes >= 7 )
## [1] 0.00016
## -----------------------------------------------------------------------------
dmultinom(c(4, 2, 0, 0), prob = rep(1/4, 4))
## [1] 0.003662109
## -----------------------------------------------------------------------------
pvec = rep(1/4, 4)
t(rmultinom(1, prob = pvec, size = 8))
##      [,1] [,2] [,3] [,4]
## [1,]    1    3    1    3
## -----------------------------------------------------------------------------
pvec = rep(1/4, 4)
t(rmultinom(1, prob = pvec, size = 80))
##      [,1] [,2] [,3] [,4]
## [1,]   23   22   15   20
## -----------------------------------------------------------------------------
pvec = rep(1/4, 4)
t(rmultinom(8, prob = pvec, size = 1))
##      [,1] [,2] [,3] [,4]
## [1,]    0    0    0    1
## [2,]    1    0    0    0
## [3,]    0    0    1    0
## [4,]    1    0    0    0
## [5,]    0    0    0    1
## [6,]    1    0    0    0
## [7,]    1    0    0    0
## [8,]    0    0    0    1
## -----------------------------------------------------------------------------
obsunder0 = rmultinom(1000, prob = pvec, size = 20)
dim(obsunder0)
## [1]    4 1000
obsunder0[, 1:11]
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
## [1,]    7    4    3    3    4    5    1    2    7     4     5
## [2,]    4    9    3    6    6    2    6    8    5     5     4
## [3,]    6    4    6    5    3    9    5    5    4     3     4
## [4,]    3    3    8    6    7    4    8    5    4     8     7
## -----------------------------------------------------------------------------
thep = unique(pvec); stopifnot(length(thep)==1, thep == 0.25)
thep
## [1] 0.25
## -----------------------------------------------------------------------------
expected0 = pvec * 20
sum((obsunder0[, 1] - expected0)^2 / expected0)
## [1] 2
sum((obsunder0[, 2] - expected0)^2 / expected0)
## [1] 4.4
sum((obsunder0[, 3] - expected0)^2 / expected0)
## [1] 3.6
(5-5)^2/5+(4-5)^2/5+(6-5)^2/5+(5-5)^2/5
## [1] 0.4
(4-5)^2/5+(6-5)^2/5+(6-5)^2/5+(4-5)^2/5
## [1] 0.8
(6-5)^2/5+(4-5)^2/5+(4-5)^2/5+(6-5)^2/5
## [1] 0.8
## -----------------------------------------------------------------------------
stat = function(obsvd, exptd = 20 * pvec) {
  sum((obsvd - exptd)^2 / exptd)
}
stat(obsunder0[, 1])
## [1] 2
## -----------------------------------------------------------------------------
S0 = apply(obsunder0, 2, stat)
summary(S0)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.200   2.800   3.086   4.000  16.400
hist(S0, breaks = 25, col = "lavender", main = "")

## -----------------------------------------------------------------------------
q95 = quantile(S0, probs = 0.95)
q95
## 95% 
## 7.6
## -----------------------------------------------------------------------------
## ## This was done to save this object for its reuse in Chapter 2.
## save(S0, file = "../data/S0.RData")

## -----------------------------------------------------------------------------
pvecA = c(3/8, 1/4, 1/4, 1/8)
observed = rmultinom(1000, prob = pvecA, size = 20)
dim(observed)
## [1]    4 1000
observed[, 1:7]
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,]    3    6    7    9   15   10    9
## [2,]    9    3    5    3    2    5    6
## [3,]    7    9    4    4    3    3    2
## [4,]    1    2    4    4    0    2    3
apply(observed, 1, mean)
## [1] 7.459 4.926 5.100 2.515
expectedA = pvecA * 20
expectedA
## [1] 7.5 5.0 5.0 2.5
## -----------------------------------------------------------------------------
stat(observed[, 1])
## [1] 8
S1 = apply(observed, 2, stat)
q95 # can't reject the first observation, because the statistic is within the 95% percentile
## 95% 
## 7.6
sum(S1 > q95)
## [1] 197
power = mean(S1 > q95)
power
## [1] 0.197
## -----------------------------------------------------------------------------
#stopifnot(stat(observed[, 1]) < q95)
## -----------------------------------------------------------------------------
dbinom(2, size = 10, prob = 0.3) # run size = 10, prob = 0.3 binomial distribution, the probability at 2
## [1] 0.2334744
pbinom(2, size = 10, prob = 0.3) # run size = 10, prob = 0.3 binomial distribution, the probability <= 2
## [1] 0.3827828
sum(dbinom(0:2, size = 10, prob = 0.3)) 
## [1] 0.3827828

Whenever we note that we keep needing a certain sequence of commands, it’s good to put them into a function. The function body contains the instructions that we want to do over and over again, the function arguments take those things that we may want to vary. Write a function to compute the probability of having a maximum as big as m when looking across n Poisson variables with rate lambda.

## -----------------------------------------------------------------------------
poismax = function(lambda, n, m) {
  epsilon = 1 - ppois(m - 1, lambda)
  1 - exp( -n * epsilon)
}
poismax(lambda = 0.5, n = 100, m = 7)
## [1] 0.0001002329
poismax(lambda = mean(e100), n = 100, m = 7)
## [1] 0.0001870183
## -----------------------------------------------------------------------------
poismax = function(lambda, n = 100, m = 7) {
  1 - exp( -n * (1 - ppois(m - 1, lambda)))
}
poismax(0.5)
## [1] 0.0001002329
poismax(0.5, m = 9)
## [1] 3.43549e-07
## -----------------------------------------------------------------------------
## if (!requireNamespace("BiocManager", quietly = TRUE))
##     install.packages("BiocManager")
## BiocManager::install(c("Biostrings", "BSgenome.Celegans.UCSC.ce2"))

## -----------------------------------------------------------------------------
library("BSgenome.Celegans.UCSC.ce2")
## Loading required package: BSgenome
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: rtracklayer
Celegans
## Worm genome:
## # organism: Caenorhabditis elegans (Worm)
## # genome: ce2
## # provider: UCSC
## # release date: Mar. 2004
## # 7 sequences:
## #   chrI   chrII  chrIII chrIV  chrV   chrX   chrM                              
## # (use 'seqnames()' to see all the sequence names, use the '$' or '[[' operator
## # to access a given sequence)
seqnames(Celegans)
## [1] "chrI"   "chrII"  "chrIII" "chrIV"  "chrV"   "chrX"   "chrM"
Celegans$chrM
## 13794-letter DNAString object
## seq: CAGTAAATAGTTTAATAAAAATATAGCATTTGGGTT...TATTTATAGATATATACTTTGTATATATCTATATTA
class(Celegans$chrM)
## [1] "DNAString"
## attr(,"package")
## [1] "Biostrings"
length(Celegans$chrM)
## [1] 13794
## -----------------------------------------------------------------------------
library("Biostrings")
lfM = letterFrequency(Celegans$chrM, letters=c("A", "C", "G", "T"))
lfM
##    A    C    G    T 
## 4335 1225 2055 6179
sum(lfM)
## [1] 13794
lfM / sum(lfM)
##          A          C          G          T 
## 0.31426707 0.08880673 0.14897782 0.44794838
## -----------------------------------------------------------------------------
t(rmultinom(1, length(Celegans$chrM), p = rep(1/4, 4)))
##      [,1] [,2] [,3] [,4]
## [1,] 3463 3397 3464 3470
## -----------------------------------------------------------------------------
length(Celegans$chrM) / 4
## [1] 3448.5
## -----------------------------------------------------------------------------
oestat = function(o, e) {
  sum((o-e)^2 / e)
}
oe = oestat(o = lfM, e = length(Celegans$chrM) / 4)
oe
## [1] 4386.634
## -----------------------------------------------------------------------------
B = 10000
n = length(Celegans$chrM)
expected = rep(n / 4, 4)
oenull = replicate(B,
  oestat(e = expected, o = rmultinom(1, n, p = rep(1/4, 4))))


## -----------------------------------------------------------------------------
hist(oenull, breaks = 100, col = "skyblue", main = "")

q95 = quantile(oenull, probs = 0.95)
q95
##     95% 
## 7.84544
mean(oenull > 10)
## [1] 0.0178
?Distributions
## starting httpd help server ... done

For the Cauchy distribution see dcauchy.

For the chi-squared distribution see dchisq.

For the exponential distribution see dexp.

For the F distribution see df.

For the gamma distribution see dgamma.

For the geometric distribution see dgeom. (This is also a special case of the negative binomial.)

For the hypergeometric distribution see dhyper.

For the log-normal distribution see dlnorm.

For the multinomial distribution see dmultinom.

For the negative binomial distribution see dnbinom.

For the normal distribution see dnorm.

For the Poisson distribution see dpois.

For the Student’s t distribution see dt.

For the uniform distribution see dunif.

For the Weibull distribution see dweibull.

posiv = rpois(n=1000, lambda=3)
mean(posiv)
## [1] 3.037
var(posiv)
## [1] 3.056688

2 Statistical Modeling

rpois(100, 0.5)
##   [1] 1 0 0 1 1 0 1 2 0 0 0 0 2 0 1 1 1 1 1 0 1 0 0 1 1 0 0 1 3 1 1 1 1 1 1 1 1
##  [38] 0 0 1 0 1 0 0 1 0 0 0 0 0 0 2 1 0 0 1 0 1 0 2 0 0 0 0 0 0 1 0 0 1 0 0 0 0
##  [75] 1 0 2 1 2 0 4 0 0 2 1 2 1 1 0 1 1 1 1 0 0 0 0 0 0 0
poismax = function(lambda, n, m) {
  epsilon = 1 - ppois(m - 1, lambda)
  1 - exp( -n * epsilon)
}
poismax(lambda = 0.5, n = 100, m = 7)
## [1] 0.0001002329
## -----------------------------------------------------------------------------
setwd("D:/books/R/blogdown/hugo")
load("../data/e100.RData")
e99 = e100[-which.max(e100)]
e100
##   [1] 2 0 1 0 0 0 2 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 1 1
##  [38] 0 1 2 2 7 1 0 2 0 1 0 1 1 1 1 0 1 0 0 0 0 1 2 2 1 0 0 0 0 0 1 0 0 0 1 1 0
##  [75] 0 1 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 1 1 0 0 1 0
e99
##  [1] 2 0 1 0 0 0 2 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 1 1 0
## [39] 1 2 2 1 0 2 0 1 0 1 1 1 1 0 1 0 0 0 0 1 2 2 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1 1
## [77] 0 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 1 1 0 0 1 0
which.max(e100)
## [1] 42
## -----------------------------------------------------------------------------
barplot(table(e99), space = 0.8, col = "chartreuse4")

## -----------------------------------------------------------------------------
library("vcd")
## Loading required package: grid
## 
## Attaching package: 'grid'
## The following object is masked from 'package:Biostrings':
## 
##     pattern
## 
## Attaching package: 'vcd'
## The following object is masked from 'package:GenomicRanges':
## 
##     tile
## The following object is masked from 'package:IRanges':
## 
##     tile
gf1 = goodfit( e99, "poisson")
rootogram(gf1, xlab = "", rect_gp = gpar(fill = "chartreuse4"))

## -----------------------------------------------------------------------------
simp = rpois(100, lambda = 0.05)
gf2 = goodfit(simp, "poisson")
rootogram(gf2, xlab = "")

## -----------------------------------------------------------------------------
table(e100)
## e100
##  0  1  2  7 
## 58 34  7  1
## -----------------------------------------------------------------------------
table(rpois(100, 3))
## 
##  0  1  2  3  4  5  6  8  9 
##  2 19 24 17 19 10  4  4  1
## -----------------------------------------------------------------------------
counts  =  table(e100)
stopifnot(identical(names(counts), c("0", "1", "2", "7")), all(counts==c(58, 34, 7, 1)))
## -----------------------------------------------------------------------------
prod(dpois(c(0, 1, 2, 7), lambda = 3) ^ (c(58, 34, 7, 1)))
## [1] 1.392143e-110
## -----------------------------------------------------------------------------
prod(dpois(c(0, 1, 2, 7), lambda = 0.4) ^ (c(58, 34, 7, 1)))
## [1] 8.5483e-46
## -----------------------------------------------------------------------------
prod(dpois(c(0, 1, 2, 7), lambda = 0) ^ (c(58, 34, 7, 1)))
## [1] 0
## -----------------------------------------------------------------------------
loglikelihood  =  function(lambda, data = e100) {
  sum(log(dpois(data, lambda)))
}

## -----------------------------------------------------------------------------
lambdas = seq(0.05, 0.95, length = 100)
loglik = vapply(lambdas, loglikelihood, numeric(1))
plot(lambdas, loglik, type = "l", col = "red", ylab = "", lwd = 2,
     xlab = expression(lambda))
m0 = mean(e100)
abline(v = m0, col = "blue", lwd = 2)
abline(h = loglikelihood(m0), col = "purple", lwd = 2)

m0
## [1] 0.55
## -----------------------------------------------------------------------------
gf  =  goodfit(e100, "poisson")
names(gf)
## [1] "observed" "count"    "fitted"   "type"     "method"   "df"       "par"
gf$par
## $lambda
## [1] 0.55
## -----------------------------------------------------------------------------
cb  =  c(rep(0, 110), rep(1, 10))

## -----------------------------------------------------------------------------
table(cb)
## cb
##   0   1 
## 110  10
## -----------------------------------------------------------------------------
mean(cb)
## [1] 0.08333333
## -----------------------------------------------------------------------------
probs  =  seq(0, 0.3, by = 0.001)
likelihood = dbinom(sum(cb), prob = probs, size = length(cb))
plot(probs, likelihood, pch = 16, xlab = "probability of success",
       ylab = "likelihood", cex=0.6)

probs[which.max(likelihood)]
## [1] 0.083
## -----------------------------------------------------------------------------
stopifnot(abs(probs[which.max(likelihood)]-1/12) < diff(probs[1:2]))
## -----------------------------------------------------------------------------
loglikelihood = function(p, n = 300, y = 40) {
  log(choose(n, y)) + y * log(p) + (n - y) * log(1 - p)
}

## -----------------------------------------------------------------------------
p_seq = seq(0, 1, by = 0.001)
plot(p_seq, loglikelihood(p_seq), xlab = "p", ylab = "log f(p|y)", type = "l")

## -----------------------------------------------------------------------------
library("Biostrings")
setwd("D:/books/R/blogdown/hugo")
staph = readDNAStringSet("../data/staphsequence.ffn.txt", "fasta")

## -----------------------------------------------------------------------------
staph[1]
## DNAStringSet object of length 1:
##     width seq                                               names               
## [1]  1362 ATGTCGGAAAAAGAAATTTGGGA...AAAAAGAAATAAGAAATGTATAA lcl|NC_002952.2_c...

The double square brackets [[i]] extract the sequence of the i-th gene as a DNAString, as opposed to the pair of single brackets [i], which return a DNAStringSet with just a single DNAString in it. If you look at the length of staph[1], it is 1, whereas staph[[1]] has length 1362.

letterFrequency(staph[[1]], letters = "ACGT", OR = 0)
##   A   C   G   T 
## 522 219 229 392
length(staph[1])
## [1] 1
length(staph[[1]])
## [1] 1362
## -----------------------------------------------------------------------------
letterFrq = vapply(staph, letterFrequency, FUN.VALUE = numeric(4),
         letters = "ACGT", OR = 0)
colnames(letterFrq) = paste0("gene", seq(along = staph))
tab10 = letterFrq[, 1:10]
tab10
##   gene1 gene2 gene3 gene4 gene5 gene6 gene7 gene8 gene9 gene10
## A   522   413    85   411   685   887   275   510   487    191
## C   219   176    31   168   293   395   137   244   180    111
## G   229   193    56   207   423   586   169   316   263    142
## T   392   352    74   327   531   793   250   445   357    252
computeProportions = function(x) { x/sum(x) }
prop10 = apply(tab10, 2, computeProportions) # nucleotides Proportions in column
round(prop10, digits = 2)
##   gene1 gene2 gene3 gene4 gene5 gene6 gene7 gene8 gene9 gene10
## A  0.38  0.36  0.35  0.37  0.35  0.33  0.33  0.34  0.38   0.27
## C  0.16  0.16  0.13  0.15  0.15  0.15  0.16  0.16  0.14   0.16
## G  0.17  0.17  0.23  0.19  0.22  0.22  0.20  0.21  0.20   0.20
## T  0.29  0.31  0.30  0.29  0.27  0.30  0.30  0.29  0.28   0.36
p0 = rowMeans(prop10) # nucleotides Proportions in all 10 genes
p0
##         A         C         G         T 
## 0.3470531 0.1518313 0.2011442 0.2999714
## -----------------------------------------------------------------------------
cs = colSums(tab10) # 10 genes length
cs
##  gene1  gene2  gene3  gene4  gene5  gene6  gene7  gene8  gene9 gene10 
##   1362   1134    246   1113   1932   2661    831   1515   1287    696
expectedtab10 = outer(p0, cs, FUN = "*") # expected nucleotides Proportions if every gene has the same Proportion
round(expectedtab10)
##   gene1 gene2 gene3 gene4 gene5 gene6 gene7 gene8 gene9 gene10
## A   473   394    85   386   671   924   288   526   447    242
## C   207   172    37   169   293   404   126   230   195    106
## G   274   228    49   224   389   535   167   305   259    140
## T   409   340    74   334   580   798   249   454   386    209
## -----------------------------------------------------------------------------
randomtab10 = sapply(cs, function(s) { rmultinom(1, s, p0) } )
randomtab10
##      gene1 gene2 gene3 gene4 gene5 gene6 gene7 gene8 gene9 gene10
## [1,]   464   388    82   384   674   930   322   494   413    226
## [2,]   231   175    48   166   313   425   118   243   179    116
## [3,]   260   234    51   230   380   555   160   341   296    137
## [4,]   407   337    65   333   565   751   231   437   399    217
all(colSums(randomtab10) == cs)
## [1] TRUE
## -----------------------------------------------------------------------------
stopifnot(all(colSums(randomtab10) == cs))
## -----------------------------------------------------------------------------
stat = function(obsvd, exptd) {
   sum((obsvd - exptd)^2 / exptd)
}
B = 1000
simulstat = replicate(B, {
  randomtab10 = sapply(cs, function(s) { rmultinom(1, s, p0) })
  stat(randomtab10, expectedtab10)
})
S1 = stat(tab10, expectedtab10)
sum(simulstat >= S1)
## [1] 0
length(simulstat)
## [1] 1000
head(simulstat)
## [1] 40.43308 25.31234 27.95283 17.37396 34.72065 29.15233
expectedtab10
##      gene1    gene2    gene3    gene4    gene5    gene6    gene7    gene8
## A 472.6864 393.5582 85.37507 386.2701 670.5066 923.5084 288.4011 525.7855
## C 206.7942 172.1767 37.35049 168.9882 293.3380 404.0230 126.1718 230.0244
## G 273.9584 228.0975 49.48147 223.8735 388.6105 535.2446 167.1508 304.7334
## T 408.5611 340.1676 73.79297 333.8682 579.5448 798.2239 249.2762 454.4567
##      gene9   gene10
## A 446.6574 241.5490
## C 195.4069 105.6746
## G 258.8726 139.9963
## T 386.0632 208.7801
randomtab10
##      gene1 gene2 gene3 gene4 gene5 gene6 gene7 gene8 gene9 gene10
## [1,]   464   388    82   384   674   930   322   494   413    226
## [2,]   231   175    48   166   313   425   118   243   179    116
## [3,]   260   234    51   230   380   555   160   341   296    137
## [4,]   407   337    65   333   565   751   231   437   399    217
hist(simulstat, col = "lavender", breaks = seq(0, 75, length.out=50))
abline(v = S1, col = "red")
abline(v = quantile(simulstat, probs = c(0.95, 0.99)),
       col = c("darkgreen", "blue"), lty = 2)

## -----------------------------------------------------------------------------
stopifnot(max(simulstat)<75, S1<75)
## -----------------------------------------------------------------------------
qs = ppoints(100)  #Generates the sequence of probability points 0.005 to 0.995
quantile(simulstat, qs)
##     0.5%     1.5%     2.5%     3.5%     4.5%     5.5%     6.5%     7.5% 
## 14.98844 16.28320 17.50259 18.24076 18.62344 19.07341 19.66079 19.91208 
##     8.5%     9.5%    10.5%    11.5%    12.5%    13.5%    14.5%    15.5% 
## 20.29684 20.62778 20.93192 21.10932 21.40516 21.64190 21.85037 22.13960 
##    16.5%    17.5%    18.5%    19.5%    20.5%    21.5%    22.5%    23.5% 
## 22.53136 22.93293 23.14039 23.41548 23.60116 23.95936 24.17024 24.42954 
##    24.5%    25.5%    26.5%    27.5%    28.5%    29.5%    30.5%    31.5% 
## 24.69359 24.92622 24.99886 25.16888 25.33169 25.50434 25.62677 25.83369 
##    32.5%    33.5%    34.5%    35.5%    36.5%    37.5%    38.5%    39.5% 
## 25.98940 26.28855 26.63070 26.81300 26.98819 27.12084 27.30601 27.47563 
##    40.5%    41.5%    42.5%    43.5%    44.5%    45.5%    46.5%    47.5% 
## 27.59761 27.78386 28.02836 28.18457 28.51322 28.66197 28.88965 29.00322 
##    48.5%    49.5%    50.5%    51.5%    52.5%    53.5%    54.5%    55.5% 
## 29.19831 29.43226 29.75325 29.87609 30.04020 30.18108 30.37550 30.63155 
##    56.5%    57.5%    58.5%    59.5%    60.5%    61.5%    62.5%    63.5% 
## 30.83079 31.01395 31.22237 31.45265 31.74654 31.90807 32.30407 32.55708 
##    64.5%    65.5%    66.5%    67.5%    68.5%    69.5%    70.5%    71.5% 
## 32.80945 32.95201 33.29111 33.50298 33.80705 33.94406 34.16076 34.33740 
##    72.5%    73.5%    74.5%    75.5%    76.5%    77.5%    78.5%    79.5% 
## 34.55167 34.79766 35.08447 35.45335 35.71938 35.85952 36.19083 36.42843 
##    80.5%    81.5%    82.5%    83.5%    84.5%    85.5%    86.5%    87.5% 
## 36.75272 37.05766 37.49101 37.94178 38.20867 38.60341 38.92721 39.44580 
##    88.5%    89.5%    90.5%    91.5%    92.5%    93.5%    94.5%    95.5% 
## 39.92408 40.43715 41.02038 41.56662 42.35350 43.56930 44.39798 44.93912 
##    96.5%    97.5%    98.5%    99.5% 
## 45.83205 48.84615 50.06561 54.04509
qchisq(qs, df = 30)
##   [1] 13.78672 15.71876 16.79077 17.57614 18.21194 18.75446 19.23275 19.66390
##   [9] 20.05886 20.42512 20.76806 21.09166 21.39894 21.69231 21.97367 22.24457
##  [17] 22.50629 22.75990 23.00632 23.24631 23.48056 23.70963 23.93405 24.15428
##  [25] 24.37070 24.58370 24.79358 25.00066 25.20519 25.40743 25.60760 25.80592
##  [33] 26.00258 26.19776 26.39164 26.58439 26.77614 26.96707 27.15729 27.34696
##  [41] 27.53620 27.72513 27.91389 28.10259 28.29136 28.48031 28.66957 28.85924
##  [49] 29.04945 29.24032 29.43196 29.62450 29.81806 30.01277 30.20876 30.40616
##  [57] 30.60512 30.80577 31.00828 31.21280 31.41949 31.62854 31.84012 32.05444
##  [65] 32.27170 32.49213 32.71597 32.94349 33.17495 33.41067 33.65098 33.89624
##  [73] 34.14685 34.40324 34.66590 34.93536 35.21223 35.49719 35.79099 36.09449
##  [81] 36.40869 36.73473 37.07391 37.42780 37.79820 38.18728 38.59768 39.03260
##  [89] 39.49603 39.99300 40.53003 41.11573 41.76190 42.48522 43.31057 44.27745
##  [97] 45.45461 46.97924 49.19886 53.67196
hist(rchisq(1000, df = 30), col = "lavender", breaks = 50)

quantile(qchisq(qs, df = 30), qs)
##     0.5%     1.5%     2.5%     3.5%     4.5%     5.5%     6.5%     7.5% 
## 14.74308 16.23869 17.16382 17.87179 18.45879 18.96730 19.42030 19.83175 
##     8.5%     9.5%    10.5%    11.5%    12.5%    13.5%    14.5%    15.5% 
## 20.21086 20.56401 20.89588 21.20996 21.50896 21.79501 22.06984 22.33486 
##    16.5%    17.5%    18.5%    19.5%    20.5%    21.5%    22.5%    23.5% 
## 22.59125 22.83999 23.08192 23.31776 23.54813 23.77359 23.99461 24.21163 
##    24.5%    25.5%    26.5%    27.5%    28.5%    29.5%    30.5%    31.5% 
## 24.42502 24.63512 24.84224 25.04668 25.24867 25.44846 25.64627 25.84230 
##    32.5%    33.5%    34.5%    35.5%    36.5%    37.5%    38.5%    39.5% 
## 26.03673 26.22975 26.42152 26.61219 26.80192 26.99084 27.17910 27.36683 
##    40.5%    41.5%    42.5%    43.5%    44.5%    45.5%    46.5%    47.5% 
## 27.55414 27.74118 27.92804 28.11486 28.30175 28.48883 28.67621 28.86400 
##    48.5%    49.5%    50.5%    51.5%    52.5%    53.5%    54.5%    55.5% 
## 29.05231 29.24127 29.43100 29.62161 29.81322 30.00595 30.19994 30.39530 
##    56.5%    57.5%    58.5%    59.5%    60.5%    61.5%    62.5%    63.5% 
## 30.59219 30.79073 30.99107 31.19337 31.39779 31.60450 31.81367 32.02550 
##    64.5%    65.5%    66.5%    67.5%    68.5%    69.5%    70.5%    71.5% 
## 32.24019 32.45796 32.67904 32.90367 33.13213 33.36471 33.60172 33.84351 
##    72.5%    73.5%    74.5%    75.5%    76.5%    77.5%    78.5%    79.5% 
## 34.09046 34.34299 34.60155 34.86665 35.13886 35.41883 35.70725 36.00496 
##    80.5%    81.5%    82.5%    83.5%    84.5%    85.5%    86.5%    87.5% 
## 36.31286 36.63203 36.96368 37.30925 37.67041 38.04916 38.44789 38.86951 
##    88.5%    89.5%    90.5%    91.5%    92.5%    93.5%    94.5%    95.5% 
## 39.31761 39.79669 40.31253 40.87266 41.48728 42.17057 42.94329 43.83752 
##    96.5%    97.5%    98.5%    99.5% 
## 44.90723 46.25504 48.12235 51.45778
## -----------------------------------------------------------------------------
qqplot(qchisq(ppoints(B), df = 30), simulstat, main = "",
  xlab = expression(chi[nu==30]^2), asp = 1, cex = 0.5, pch = 16)
abline(a = 0, b = 1, col = "red")

## -----------------------------------------------------------------------------
1 - pchisq(S1, df = 30)
## [1] 4.74342e-05
## -----------------------------------------------------------------------------
setwd("D:/books/R/blogdown/hugo")
load("../data/ChargaffTable.RData")
ChargaffTable
##                   A    T    C    G
## Human-Thymus   30.9 29.4 19.9 19.8
## Mycobac.Tuber  15.1 14.6 34.9 35.4
## Chicken-Eryth. 28.8 29.2 20.5 21.5
## Sheep-liver    29.3 29.3 20.5 20.7
## Sea Urchin     32.8 32.1 17.7 17.3
## Wheat          27.3 27.1 22.7 22.8
## Yeast          31.3 32.9 18.7 17.1
## E.coli         24.7 23.6 26.0 25.7
## -----------------------------------------------------------------------------
stopifnot(nrow(ChargaffTable) == 8)
mycolors = c("chocolate", "aquamarine4", "cadetblue4", "coral3",
            "chartreuse4","darkgoldenrod4","darkcyan","brown4")
par(mfrow=c(2, 4), mai = c(0, 0.7, 0.7, 0))
for (i in 1:8) {
  cbp = barplot(ChargaffTable[i, ], horiz = TRUE, axes = FALSE, axisnames = FALSE, col = mycolors[i])
  ax = axis(3, las = 2, labels = FALSE, col = mycolors[i], cex = 0.5, at = c(0, 10, 20))
  mtext(side = 3, at = ax,  text = paste(ax), col = mycolors[i], line = 0, las = 1, cex = 0.9)
  mtext(side = 2, at = cbp, text = colnames(ChargaffTable), col = mycolors[i], line = 0, las = 2, cex = 1)
  title(paste(rownames(ChargaffTable)[i]), col = mycolors[i], cex = 1.1)
}

## -----------------------------------------------------------------------------
statChf = function(x){
  sum((x[, "C"] - x[, "G"])^2 + (x[, "A"] - x[, "T"])^2)
}
chfstat = statChf(ChargaffTable)
chfstat
## [1] 11.08
permstat = replicate(100000, {
     permuted = t(apply(ChargaffTable, 1, sample))
     colnames(permuted) = colnames(ChargaffTable)
     statChf(permuted)
})
pChf = mean(permstat <= chfstat)
pChf
## [1] 0.00012
hist(permstat, breaks = 100, main = "", col = "lavender")
abline(v = chfstat, lwd = 2, col = "red")

## -----------------------------------------------------------------------------
HairEyeColor[,, "Female"]
##        Eye
## Hair    Brown Blue Hazel Green
##   Black    36    9     5     2
##   Brown    66   34    29    14
##   Red      16    7     7     7
##   Blond     4   64     5     8
## -----------------------------------------------------------------------------
str(HairEyeColor)
##  'table' num [1:4, 1:4, 1:2] 32 53 10 3 11 50 10 30 10 25 ...
##  - attr(*, "dimnames")=List of 3
##   ..$ Hair: chr [1:4] "Black" "Brown" "Red" "Blond"
##   ..$ Eye : chr [1:4] "Brown" "Blue" "Hazel" "Green"
##   ..$ Sex : chr [1:2] "Male" "Female"
?HairEyeColor
## -----------------------------------------------------------------------------
setwd("D:/books/R/blogdown/hugo")
load("../data/Deuteranopia.RData")
Deuteranopia
##           Men Women
## Deute      19     2
## NonDeute 1981  1998
## -----------------------------------------------------------------------------
test <- chisq.test(Deuteranopia)
test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Deuteranopia
## X-squared = 12.255, df = 1, p-value = 0.0004641
test$statistic # test statistic
## X-squared 
##  12.25481
test$p.value # p-value
## [1] 0.0004640594
test$observed
##           Men Women
## Deute      19     2
## NonDeute 1981  1998
test$expected
##             Men  Women
## Deute      10.5   10.5
## NonDeute 1989.5 1989.5
(19+2)/2
## [1] 10.5
(1981+1998)/2
## [1] 1989.5
# Yates' continuity correction is subtract 0.5 from the absolute value 
(19-10.5-0.5)^2/10.5+(abs(2-10.5)-0.5)^2/10.5+(abs(1981-1989.5)-0.5)^2/1989.5+(1998-1989.5-0.5)^2/1989.5
## [1] 12.25481
test$method
## [1] "Pearson's Chi-squared test with Yates' continuity correction"
test$residuals
##                 Men      Women
## Deute     2.6231569 -2.6231569
## NonDeute -0.1905667  0.1905667
## -----------------------------------------------------------------------------
library("HardyWeinberg")
## Loading required package: mice
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:IRanges':
## 
##     cbind, rbind
## The following objects are masked from 'package:S4Vectors':
## 
##     cbind, rbind
## The following objects are masked from 'package:BiocGenerics':
## 
##     cbind, rbind
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Loading required package: Rsolnp
## Loading required package: nnet
## 
## Attaching package: 'HardyWeinberg'
## The following object is masked from 'package:S4Vectors':
## 
##     fold
data("Mourant")
Mourant[214:216,]
##     Population    Country Total  MM  MN  NN
## 214    Oceania Micronesia   962 228 436 298
## 215    Oceania Micronesia   678  36 229 413
## 216    Oceania     Tahiti   580 188 296  96
nMM = Mourant$MM[216]
nMN = Mourant$MN[216]
nNN = Mourant$NN[216]
loglik = function(p, q = 1 - p) {
  2 * nMM * log(p) + nMN * log(2*p*q) + 2 * nNN * log(q)
}
xv = seq(0.01, 0.99, by = 0.01)
yv = loglik(xv)
plot(x = xv, y = yv, type = "l", lwd = 2,
     xlab = "p", ylab = "log-likelihood")
imax = which.max(yv)
abline(v = xv[imax], h = yv[imax], lwd = 1.5, col = "blue")
abline(h = yv[imax], lwd = 1.5, col = "purple")

xv[imax]
## [1] 0.58
## -----------------------------------------------------------------------------
phat  =  af(c(nMM, nMN, nNN))
phat
## [1] 0.5793103
pMM   =  phat^2
qhat  =  1 - phat

## -----------------------------------------------------------------------------
pHW = c(MM = phat^2, MN = 2*phat*qhat, NN = qhat^2)
sum(c(nMM, nMN, nNN)) * pHW
##       MM       MN       NN 
## 194.6483 282.7034 102.6483
Mourant[, c("MM", "MN", "NN")][c(1, 69, 128, 148, 192),]
##       MM   MN  NN
## 1   1743 3222 763
## 69   254  736 174
## 128  207  916 142
## 148  181  563  68
## 192   90  473  56
as.matrix(Mourant[, c("MM", "MN", "NN")])[c(1, 69, 128, 148, 192),]
##        MM   MN  NN
## [1,] 1743 3222 763
## [2,]  254  736 174
## [3,]  207  916 142
## [4,]  181  563  68
## [5,]   90  473  56
c("red", rep("purple", 4))
## [1] "red"    "purple" "purple" "purple" "purple"
## -----------------------------------------------------------------------------
par(mai = rep(0.1, 4))
pops = c(1, 69, 128, 148, 192)
genotypeFrequencies = as.matrix(Mourant[, c("MM", "MN", "NN")])
HWTernaryPlot(genotypeFrequencies[pops, ],
        markerlab = Mourant$Country[pops],
        alpha = 0.0001, curvecols = c("red", rep("purple", 4)),
        mcex = 0.75, vertex.cex = 1)

## -----------------------------------------------------------------------------
HWTernaryPlot(genotypeFrequencies[pops, ],
              markerlab = Mourant$Country[pops],
              curvecols = c("red", rep("purple", 4)),
              alpha = 0.0001, mcex = 0.75, vertex.cex = 1)

HWTernaryPlot(genotypeFrequencies[-pops, ], 
              newframe = FALSE, alpha = 0.0001, cex = 0.5)

## -----------------------------------------------------------------------------
newgf = round(genotypeFrequencies / 50)
HWTernaryPlot(newgf[pops, ],
              markerlab = Mourant$Country[pops],
              curvecols = c("red", rep("purple", 4)),
              alpha = 0.0001, mcex = 0.75, vertex.cex = 1)

## -----------------------------------------------------------------------------
library("seqLogo")
## 
## Attaching package: 'seqLogo'
## The following object is masked from 'package:mice':
## 
##     ic
setwd("D:/books/R/blogdown/hugo")
load("../data/kozak.RData")
kozak
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## A 0.33 0.25  0.4 0.15 0.20    1    0    0 0.05
## C 0.12 0.25  0.1 0.40 0.40    0    0    0 0.05
## G 0.33 0.25  0.4 0.20 0.25    0    0    1 0.90
## T 0.22 0.25  0.1 0.25 0.15    0    1    0 0.00
pwm = makePWM(kozak)
seqLogo(pwm, ic.scale = FALSE)

## -----------------------------------------------------------------------------
library("markovchain")
## Package:  markovchain
## Version:  0.9.1
## Date:     2023-01-20
## BugReport: https://github.com/spedygiorgio/markovchain/issues
library("igraph")
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:rtracklayer':
## 
##     blocks, path
## The following object is masked from 'package:Biostrings':
## 
##     union
## The following object is masked from 'package:XVector':
## 
##     path
## The following object is masked from 'package:GenomicRanges':
## 
##     union
## The following object is masked from 'package:IRanges':
## 
##     union
## The following object is masked from 'package:S4Vectors':
## 
##     union
## The following objects are masked from 'package:BiocGenerics':
## 
##     normalize, path, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
sequence = toupper(c("a", "c", "a", "c", "g", "t", "t", "t", "t", "c", "c",
                     "a", "c", "g", "t", "a", "c","c","c","a","a","a","t","a",
                     "c","g","g","c","a","t","g","t","g","t","g","a","g","c","t","g"))
mcFit   =  markovchainFit(data = sequence)
mcFit
## $estimate
## MLE Fit 
##  A  4 - dimensional discrete Markov Chain defined by the following states: 
##  A, C, G, T 
##  The transition matrix  (by rows)  is defined as follows: 
##           A         C         G          T
## A 0.2000000 0.5000000 0.1000000 0.20000000
## C 0.3636364 0.2727273 0.2727273 0.09090909
## G 0.1250000 0.2500000 0.1250000 0.50000000
## T 0.2000000 0.1000000 0.4000000 0.30000000
## 
## 
## $standardError
##           A         C         G          T
## A 0.1414214 0.2236068 0.1000000 0.14142136
## C 0.1818182 0.1574592 0.1574592 0.09090909
## G 0.1250000 0.1767767 0.1250000 0.25000000
## T 0.1414214 0.1000000 0.2000000 0.17320508
## 
## $confidenceLevel
## [1] 0.95
## 
## $lowerEndpointMatrix
##             A          C           G         T
## A 0.000000000 0.06173864 0.000000000 0.0000000
## C 0.007279201 0.00000000 0.000000000 0.0000000
## G 0.000000000 0.00000000 0.000000000 0.0100089
## T 0.000000000 0.00000000 0.008007122 0.0000000
## 
## $upperEndpointMatrix
##           A         C         G         T
## A 0.4771808 0.9382614 0.2959964 0.4771808
## C 0.7199935 0.5813416 0.5813416 0.2690877
## G 0.3699955 0.5964760 0.3699955 0.9899911
## T 0.4771808 0.2959964 0.7919929 0.6394758
## 
## $logLikelihood
## [1] -48.94867
MCgraph =  markovchain:::.getNet(mcFit$estimate, round = TRUE)
MCgraph
## IGRAPH 7e22123 DNW- 4 16 -- 
## + attr: name (v/c), weight (e/n)
## + edges from 7e22123 (vertex names):
##  [1] A->A A->C A->G A->T C->A C->C C->G C->T G->A G->C G->G G->T T->A T->C T->G
## [16] T->T
E(MCgraph)
## + 16/16 edges from 7e22123 (vertex names):
##  [1] A->A A->C A->G A->T C->A C->C C->G C->T G->A G->C G->G G->T T->A T->C T->G
## [16] T->T
E(MCgraph)$weight
##  [1] 20.00 50.00 10.00 20.00 36.36 27.27 27.27  9.09 12.50 25.00 12.50 50.00
## [13] 20.00 10.00 40.00 30.00
edgelab =  round(E(MCgraph)$weight / 100, 2)
edgelab
##  [1] 0.20 0.50 0.10 0.20 0.36 0.27 0.27 0.09 0.12 0.25 0.12 0.50 0.20 0.10 0.40
## [16] 0.30
## -----------------------------------------------------------------------------
par(mai=c(0,0,0,0))
plot.igraph(MCgraph, edge.label = edgelab,
       vertex.size = 40, xlim = c(-1, 1.25))

## -----------------------------------------------------------------------------
par(mai=c(0,0,0,0))
plot.igraph(MCgraph, edge.label = edgelab,
       vertex.size = 40, xlim = c(-1, 1.25))

## -----------------------------------------------------------------------------
setwd("D:/books/R/blogdown/hugo")
haplo6 = read.table("../data/haplotype6.txt", header = TRUE)
haplo6
##   Individual DYS19 DXYS156Y DYS389m DYS389n DYS389p
## 1         H1    14       12       4      12       3
## 2         H3    15       13       4      13       3
## 3         H4    15       11       5      11       3
## 4         H5    17       13       4      11       3
## 5         H7    13       12       5      12       3
## 6         H8    16       11       5      12       3
## -----------------------------------------------------------------------------
with(haplo6, stopifnot(Individual[1] == "H1", DYS19[1] == 14, DXYS156Y[1] == 12))
## -----------------------------------------------------------------------------
dfbetas = data.frame(
  p = rep(p_seq, 3),
  dbeta = c(dbeta(p_seq,  10,  30),
            dbeta(p_seq,  20,  60), 
            dbeta(p_seq,  50, 150)),
  pars = rep(c("Beta(10,30)", "Beta(20,60)", "Beta(50,150)"), each = length(p_seq)))
library("ggplot2")
## 
## Attaching package: 'ggplot2'
## The following object is masked _by_ '.GlobalEnv':
## 
##     stat
ggplot(dfbetas) +
  geom_line(aes(x = p, y = dbeta, colour = pars)) +
  theme(legend.title = element_blank()) +
  geom_vline(aes(xintercept = 0.25), colour = "#990000", linetype = "dashed") + # mean
  geom_vline(aes(xintercept = (10-1)/(10+30-2)), colour = "#888800") # mode

data <- rbeta(100000, 50, 350)
summary(data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.06693 0.11347 0.12434 0.12496 0.13572 0.21112
hist(data, breaks = 100, col = "skyblue", main = "")

## -----------------------------------------------------------------------------
rp = rbeta(100000, 50, 350)
y = vapply(rp, 
           function(x) rbinom(1, prob = x, size = 300), 
           integer(1))
hist(y, breaks = 50, col = "orange", main = "", xlab = "")

## -----------------------------------------------------------------------------
set.seed(0xbebe)
y1 = vapply(rp, 
            function(x) rbinom(1, prob = x, size = 300), 
            integer(1))
set.seed(0xbebe)
y2 = rbinom(length(rp), rp, size = 300)
stopifnot(identical(y1, y2))
hist(y2, breaks = 50, col = "orange", main = "", xlab = "")

## -----------------------------------------------------------------------------
pPostEmp = rp[ y == 40 ]
hist(pPostEmp, breaks = 40, col = "chartreuse4", main = "",
  probability = TRUE, xlab = "posterior p")

p_seq = seq(0, 1, by = 0.001)
densPostTheory = dbeta(p_seq, 50 + 40, 350 + 260)
lines(p_seq, densPostTheory, type = "l", lwd = 3)

## -----------------------------------------------------------------------------
mean(pPostEmp)
## [1] 0.1283359
dp = p_seq[2] - p_seq[1]
dp
## [1] 0.001
sum(p_seq * densPostTheory * dp)
## [1] 0.1285714
## -----------------------------------------------------------------------------
stopifnot(abs(mean(pPostEmp) - sum(p_seq * densPostTheory * dp)) < 1e-3)
## -----------------------------------------------------------------------------
pPostMC = rbeta(n = 100000, 90, 610)
mean(pPostMC)
## [1] 0.128571
## -----------------------------------------------------------------------------
qqplot(pPostMC, pPostEmp, type = "l", asp = 1)
abline(a = 0, b = 1, col = "blue")

## -----------------------------------------------------------------------------
densPost2 = dbeta(p_seq, 115, 735)
mcPost2   = rbeta(1e6, 115, 735)
sum(p_seq * densPost2 * dp)   # mean, by numeric integration
## [1] 0.1352941
mean(mcPost2)                 # mean by MC
## [1] 0.1352657
p_seq[which.max(densPost2)]   # MAP estimate
## [1] 0.134
## -----------------------------------------------------------------------------
quantile(mcPost2, c(0.025, 0.975))
##      2.5%     97.5% 
## 0.1131087 0.1590205
## -----------------------------------------------------------------------------
library("Biostrings")

## -----------------------------------------------------------------------------
## GENETIC_CODE
## IUPAC_CODE_MAP
## vignette(package = "Biostrings")
## vignette("BiostringsQuickOverview", package = "Biostrings")

## -----------------------------------------------------------------------------
GENETIC_CODE
## TTT TTC TTA TTG TCT TCC TCA TCG TAT TAC TAA TAG TGT TGC TGA TGG CTT CTC CTA CTG 
## "F" "F" "L" "L" "S" "S" "S" "S" "Y" "Y" "*" "*" "C" "C" "*" "W" "L" "L" "L" "L" 
## CCT CCC CCA CCG CAT CAC CAA CAG CGT CGC CGA CGG ATT ATC ATA ATG ACT ACC ACA ACG 
## "P" "P" "P" "P" "H" "H" "Q" "Q" "R" "R" "R" "R" "I" "I" "I" "M" "T" "T" "T" "T" 
## AAT AAC AAA AAG AGT AGC AGA AGG GTT GTC GTA GTG GCT GCC GCA GCG GAT GAC GAA GAG 
## "N" "N" "K" "K" "S" "S" "R" "R" "V" "V" "V" "V" "A" "A" "A" "A" "D" "D" "E" "E" 
## GGT GGC GGA GGG 
## "G" "G" "G" "G" 
## attr(,"alt_init_codons")
## [1] "TTG" "CTG"
IUPAC_CODE_MAP
##      A      C      G      T      M      R      W      S      Y      K      V 
##    "A"    "C"    "G"    "T"   "AC"   "AG"   "AT"   "CG"   "CT"   "GT"  "ACG" 
##      H      D      B      N 
##  "ACT"  "AGT"  "CGT" "ACGT"
## -----------------------------------------------------------------------------
library("BSgenome")
ag = available.genomes()
length(ag)
## [1] 112
ag[1:2]
## [1] "BSgenome.Alyrata.JGI.v1"              
## [2] "BSgenome.Amellifera.BeeBase.assembly4"
## -----------------------------------------------------------------------------
library("BSgenome.Ecoli.NCBI.20080805")
Ecoli
## E. coli genome:
## # organism: Escherichia coli (E. coli)
## # genome: 2008/08/05
## # provider: NCBI
## # release date: NA
## # 13 sequences:
## #   NC_008253 NC_008563 NC_010468 NC_004431 NC_009801 NC_009800 NC_002655
## #   NC_002695 NC_010498 NC_007946 NC_010473 NC_000913 AC_000091          
## # (use 'seqnames()' to see all the sequence names, use the '$' or '[[' operator
## # to access a given sequence)
shineDalgarno = "AGGAGGT"
ecoli = Ecoli$NC_010473
ecoli
## 4686137-letter DNAString object
## seq: AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTC...ATCACCAAATAAAAAACGCCTTAGTAAGTATTTTTC
## -----------------------------------------------------------------------------
window = 50000
starts = seq(1, length(ecoli) - window, by = window)
length(ecoli) - window
## [1] 4636137
head(starts)
## [1]      1  50001 100001 150001 200001 250001
seq_along(starts)
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## [51] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## [76] 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
starts[93]
## [1] 4600001
ends   = starts + window - 1
head(ends)
## [1]  50000 100000 150000 200000 250000 300000
seq_along(ends)
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## [51] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## [76] 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
ends[93]
## [1] 4650000
numMatches = vapply(seq_along(starts), function(i) {
  countPattern(shineDalgarno, ecoli[starts[i]:ends[i]],
               max.mismatch = 0)
  }, numeric(1))
table(numMatches)
## numMatches
##  0  1  2  3  4 
## 48 32  8  3  2
mean(numMatches) # the maximum likelihood estimate of poisson lambda is the mean
## [1] 0.6989247
dpois(c(0, 1, 2, 3, 4), lambda = 0.6989247)*length(numMatches) # the prediction of poisson distribution
## [1] 46.2321199 32.3127706 11.2920967  2.6307751  0.4596784
## -----------------------------------------------------------------------------
library("vcd")
gf = goodfit(numMatches, "poisson")
summary(gf)
## 
##   Goodness-of-fit test for poisson distribution
## 
##                       X^2 df  P(> X^2)
## Likelihood Ratio 4.134932  3 0.2472577
gf$observed
## [1] 48 32  8  3  2
gf$count
## [1] 0 1 2 3 4
gf$df
## [1] 3
gf$method
## [1] "ML"
gf$fitted
## [1] 46.2321185 32.3127710 11.2920974  2.6307754  0.4596785
gf$par
## $lambda
## [1] 0.6989247
distplot(numMatches, type = "poisson")

## -----------------------------------------------------------------------------
sdMatches = matchPattern(shineDalgarno, ecoli, max.mismatch = 0)
head(sdMatches)
## Views on a 4686137-letter DNAString subject
## subject: AGCTTTTCATTCTGACTGCAACGGGCAATATGTC...CACCAAATAAAAAACGCCTTAGTAAGTATTTTTC
## views:
##        start    end width
##   [1]  56593  56599     7 [AGGAGGT]
##   [2] 199644 199650     7 [AGGAGGT]
##   [3] 202176 202182     7 [AGGAGGT]
##   [4] 214433 214439     7 [AGGAGGT]
##   [5] 217429 217435     7 [AGGAGGT]
##   [6] 227331 227337     7 [AGGAGGT]
## -----------------------------------------------------------------------------
betweenmotifs = gaps(sdMatches)
head(betweenmotifs)
## Views on a 4686137-letter DNAString subject
## subject: AGCTTTTCATTCTGACTGCAACGGGCAATATGTC...CACCAAATAAAAAACGCCTTAGTAAGTATTTTTC
## views:
##        start    end  width
##   [1]      1  56592  56592 [AGCTTTTCATTCTGACTGCAACGG...TCCAGGTGTCAGAACCCGGCAGAC]
##   [2]  56600 199643 143044 [AAAGCTACCGTTATCCAGAATGGT...TGGGAGAGCGCCTGCTTTGCACGC]
##   [3] 199651 202175   2525 [CTGCGGTTCGATCCCGCATAGCTC...GTTGGCTAATCCTGGTCGGACATC]
##   [4] 202183 214432  12250 [TAGTGCAATGGCATAAGCCAGCTT...ATTATCGTGTTATCGCCAGGCTTT]
##   [5] 214440 217428   2989 [TAATAACATGGGCAGGATAAGCTC...CATAACGAAAAGCCCCTTACTTGT]
##   [6] 217436 227330   9895 [CTGACCACTTGTGATGATATGGTT...CGTTTTCTGATTCCACAAACTGCA]
length(betweenmotifs)
## [1] 66
head(width(betweenmotifs))
## [1]  56592 143044   2525  12250   2989   9895
mean(width(betweenmotifs))
## [1] 70995.18
table(width(betweenmotifs))
## 
##    431    921   2525   2526   2989   4731   5029   5509   6066   9373   9895 
##      1      1      1      1      1      1      1      1      1      1      1 
##  10156  10395  11659  11825  12250  18141  18402  21195  22018  23136  23923 
##      1      1      1      1      1      1      1      1      1      1      1 
##  27523  27604  27756  27949  29936  38566  39295  41394  41401  44557  46875 
##      1      1      1      1      1      1      1      1      1      1      1 
##  47744  51681  53830  56080  56592  59234  59292  62938  63513  64798  69573 
##      1      1      1      1      1      1      1      1      1      1      1 
##  74505  79547  86205  86243  87327  90295  90941  94206  97259  97557 113253 
##      1      1      1      1      1      1      1      1      1      1      1 
## 119872 134895 143044 153621 179327 189903 241430 251600 262848 311563 429015 
##      1      1      1      1      1      1      1      1      1      1      1
## -----------------------------------------------------------------------------
library("Renext")
## Loading required package: evd
## 
## Attaching package: 'evd'
## The following object is masked from 'package:igraph':
## 
##     clusters
expplot(width(betweenmotifs), rate = 1/mean(width(betweenmotifs)),
        labels = "fit")

## -----------------------------------------------------------------------------
## gofExp.test(width(betweenmotifs))

## -----------------------------------------------------------------------------
library("BSgenome.Hsapiens.UCSC.hg19")
setwd("D:/books/R/blogdown/hugo")
chr8  =  Hsapiens$chr8
CpGtab = read.table("../data/model-based-cpg-islands-hg19.txt",
                    header = TRUE)
nrow(CpGtab)
## [1] 65699
head(CpGtab)
##     chr  start    end length CpGcount GCcontent pctGC obsExp
## 1 chr10  93098  93818    721       32       403 0.559  0.572
## 2 chr10  94002  94165    164       12        97 0.591  0.841
## 3 chr10  94527  95302    776       65       538 0.693  0.702
## 4 chr10 119652 120193    542       53       369 0.681  0.866
## 5 chr10 122133 122621    489       51       339 0.693  0.880
## 6 chr10 180265 180720    456       32       256 0.561  0.893
irCpG = with(dplyr::filter(CpGtab, chr == "chr8"),
         IRanges(start = start, end = end))
head(irCpG)
## IRanges object with 6 ranges and 0 metadata columns:
##           start       end     width
##       <integer> <integer> <integer>
##   [1]     32437     33708      1272
##   [2]     40354     40656       303
##   [3]     44536     46203      1668
##   [4]     99457    100721      1265
##   [5]    155954    156761       808
##   [6]    179033    179756       724
## -----------------------------------------------------------------------------
grCpG = GRanges(ranges = irCpG, seqnames = "chr8", strand = "+")
genome(grCpG) = "hg19"
head(grCpG)
## GRanges object with 6 ranges and 0 metadata columns:
##       seqnames        ranges strand
##          <Rle>     <IRanges>  <Rle>
##   [1]     chr8   32437-33708      +
##   [2]     chr8   40354-40656      +
##   [3]     chr8   44536-46203      +
##   [4]     chr8  99457-100721      +
##   [5]     chr8 155954-156761      +
##   [6]     chr8 179033-179756      +
##   -------
##   seqinfo: 1 sequence from hg19 genome; no seqlengths
## -----------------------------------------------------------------------------
library("Gviz")
ideo = IdeogramTrack(genome = "hg19", chromosome = "chr8")
plotTracks(
  list(GenomeAxisTrack(),
    AnnotationTrack(grCpG, name = "CpG"), ideo),
    from = 2200000, to = 5800000,
    shape = "box", fill = "#006400", stacking = "dense")

## -----------------------------------------------------------------------------
CGIview    = Views(unmasked(Hsapiens$chr8), irCpG)
head(CGIview)
## Views on a 146364022-letter DNAString subject
## subject: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
## views:
##        start    end width
##   [1]  32437  33708  1272 [CGGGTGCCATCTCAGCAGCTCACGG...AGAACACGGCGGGGGGGGCGGCGC]
##   [2]  40354  40656   303 [CGATTAGCGGGAGCGCAAAGCGAAG...GCGGTCGGCTCGGCGAGGAGCCGG]
##   [3]  44536  46203  1668 [CGGCGCTGCAGGAGAGGAGATGCCC...GAGTTCATGGGTGTGACGGGGCGT]
##   [4]  99457 100721  1265 [CGGCTGGCCGGGCAGGGGGGCTGAC...ATCCTCCCGGCGGTCGGGCGGCGG]
##   [5] 155954 156761   808 [CGGGAAAGATTTTATTCACCGTCGA...CGCAGTATACTGGCGGCACGCCGC]
##   [6] 179033 179756   724 [CGTGTTCCGGGGGTTATATGAGTGT...CCCTAGAGCTCAAGGCACTGTCGG]
NonCGIview = Views(unmasked(Hsapiens$chr8), gaps(irCpG))
head(NonCGIview)
## Views on a 146364022-letter DNAString subject
## subject: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
## views:
##        start    end width
##   [1]  33709  40353  6645 [TCTTCTCATCACGTGCTCTCAGGGG...CGCGGAGCAGCAGGCACTCATTCC]
##   [2]  40657  44535  3879 [TCTCCAAGTGCCGCCAGCTGCGGGA...CGCGGGTTCTCTGTGGCCAGCAGG]
##   [3]  46204  99456 53253 [GTGCTGTGTGAGAACATGTGTGTAG...GCGCCCCTCACCTCCCGGACGGGG]
##   [4] 100722 155953 55232 [CGGCTGCCCTAAGTGATCTTTTTAA...ACCTTGATTATTAACGGCTGCAAC]
##   [5] 156762 179032 22271 [CTGCTGGCAGCTAGGGACATTGCAG...TATGTACGCTCTGAGAGATACATG]
##   [6] 179757 181776  2020 [AAGCTGAGCGCCCTCTGCTACCCCT...CATCCACGCCTGTCCCAAAGGTAC]
head(DNAStringSet(CGIview))
## DNAStringSet object of length 6:
##     width seq
## [1]  1272 CGGGTGCCATCTCAGCAGCTCACGGTGTGGAAAC...GTTTGCGGAAGAACACGGCGGGGGGGGCGGCGC
## [2]   303 CGATTAGCGGGAGCGCAAAGCGAAGGGGCGGCCT...CGCGCACGCGCGGTCGGCTCGGCGAGGAGCCGG
## [3]  1668 CGGCGCTGCAGGAGAGGAGATGCCCAGGCCAGGC...TCTCGGTGTGAGTTCATGGGTGTGACGGGGCGT
## [4]  1265 CGGCTGGCCGGGCAGGGGGGCTGACCCCCCCCAC...CTGAACTCCATCCTCCCGGCGGTCGGGCGGCGG
## [5]   808 CGGGAAAGATTTTATTCACCGTCGATGCGGCCCC...TCTCTTGCTCGCAGTATACTGGCGGCACGCCGC
## [6]   724 CGTGTTCCGGGGGTTATATGAGTGTGACGGGTGT...ACTGCAGCGCCCTAGAGCTCAAGGCACTGTCGG
## -----------------------------------------------------------------------------
seqCGI      = as(CGIview, "DNAStringSet")
head(seqCGI)
## DNAStringSet object of length 6:
##     width seq
## [1]  1272 CGGGTGCCATCTCAGCAGCTCACGGTGTGGAAAC...GTTTGCGGAAGAACACGGCGGGGGGGGCGGCGC
## [2]   303 CGATTAGCGGGAGCGCAAAGCGAAGGGGCGGCCT...CGCGCACGCGCGGTCGGCTCGGCGAGGAGCCGG
## [3]  1668 CGGCGCTGCAGGAGAGGAGATGCCCAGGCCAGGC...TCTCGGTGTGAGTTCATGGGTGTGACGGGGCGT
## [4]  1265 CGGCTGGCCGGGCAGGGGGGCTGACCCCCCCCAC...CTGAACTCCATCCTCCCGGCGGTCGGGCGGCGG
## [5]   808 CGGGAAAGATTTTATTCACCGTCGATGCGGCCCC...TCTCTTGCTCGCAGTATACTGGCGGCACGCCGC
## [6]   724 CGTGTTCCGGGGGTTATATGAGTGTGACGGGTGT...ACTGCAGCGCCCTAGAGCTCAAGGCACTGTCGG
length(seqCGI)
## [1] 2855
seqNonCGI   = as(NonCGIview, "DNAStringSet")
head(seqNonCGI)
## DNAStringSet object of length 6:
##     width seq
## [1]  6645 TCTTCTCATCACGTGCTCTCAGGGGCCACGATGT...CCGCAAGAACGCGGAGCAGCAGGCACTCATTCC
## [2]  3879 TCTCCAAGTGCCGCCAGCTGCGGGATTTCCTCTG...GCCGGCGCACGCGGGTTCTCTGTGGCCAGCAGG
## [3] 53253 GTGCTGTGTGAGAACATGTGTGTAGTGTTCACAT...CGGGCAGAGGCGCCCCTCACCTCCCGGACGGGG
## [4] 55232 CGGCTGCCCTAAGTGATCTTTTTAAGTAAAGGAG...TGGTCTCTGACCTTGATTATTAACGGCTGCAAC
## [5] 22271 CTGCTGGCAGCTAGGGACATTGCAGGGCCCTCTT...CCCGGGCGGTATGTACGCTCTGAGAGATACATG
## [6]  2020 AAGCTGAGCGCCCTCTGCTACCCCTCCTGCTGCA...ATGGGATGTCATCCACGCCTGTCCCAAAGGTAC
length(seqNonCGI)
## [1] 2854
dinucCpG    = sapply(seqCGI, dinucleotideFrequency)
dim(dinucCpG)
## [1]   16 2855
dinucCpG[,1]
##  AA  AC  AG  AT  CA  CC  CG  CT  GA  GC  GG  GT  TA  TC  TG  TT 
##  49 127  55   8 132  44 116 112  54 129 107 100   4 104 112  18
dinucNonCpG = sapply(seqNonCGI, dinucleotideFrequency)
dim(dinucNonCpG)
## [1]   16 2854
dinucNonCpG[, 1]
##  AA  AC  AG  AT  CA  CC  CG  CT  GA  GC  GG  GT  TA  TC  TG  TT 
## 389 351 400 436 498 560 112 603 359 336 403 336 330 527 519 485
NonICounts = rowSums(dinucNonCpG)
NonICounts
##       AA       AC       AG       AT       CA       CC       CG       CT 
## 14223322  7129981  9795572 11291315 10241505  6931732  1174927  9779692 
##       GA       GC       GG       GT       TA       TC       TG       TT 
##  8372468  5706958  6934755  7112531  9602902  8359398 10221420 14197986
IslCounts  = rowSums(dinucCpG)
IslCounts
##     AA     AC     AG     AT     CA     CC     CG     CT     GA     GC     GG 
##  64103  83109 122131  44000 113346 193847 133549 122377 105186 177326 194517 
##     GT     TA     TC     TG     TT 
##  86752  31210 106738 114592  65861
## -----------------------------------------------------------------------------
TI  = matrix(IslCounts, ncol = 4, byrow = TRUE)
TnI = matrix(NonICounts, ncol = 4, byrow = TRUE)
dimnames(TI) = dimnames(TnI) =
  list(c("A", "C", "G", "T"), c("A", "C", "G", "T"))
head(TI)
##        A      C      G      T
## A  64103  83109 122131  44000
## C 113346 193847 133549 122377
## G 105186 177326 194517  86752
## T  31210 106738 114592  65861
head(TnI)
##          A       C        G        T
## A 14223322 7129981  9795572 11291315
## C 10241505 6931732  1174927  9779692
## G  8372468 5706958  6934755  7112531
## T  9602902 8359398 10221420 14197986
## -----------------------------------------------------------------------------
MI = TI /rowSums(TI)
MI
##            A         C         G         T
## A 0.20457773 0.2652333 0.3897678 0.1404212
## C 0.20128250 0.3442381 0.2371595 0.2173200
## G 0.18657245 0.3145299 0.3450223 0.1538754
## T 0.09802105 0.3352314 0.3598984 0.2068492
MN = TnI / rowSums(TnI)
MN
##           A         C          G         T
## A 0.3351380 0.1680007 0.23080886 0.2660524
## C 0.3641054 0.2464366 0.04177094 0.3476871
## G 0.2976696 0.2029017 0.24655406 0.2528746
## T 0.2265813 0.1972407 0.24117528 0.3350027

The transitions are different. For instance, the transitions from C to A and T to A for in the islands (MI) transition matrix seem very different (0.201 versus 0.098).

alphabetFrequency(seqCGI, baseOnly = TRUE, collapse = TRUE)
##      A      C      G      T  other 
## 313845 563875 564789 318990      0
## -----------------------------------------------------------------------------
freqIsl = alphabetFrequency(seqCGI, baseOnly = TRUE, collapse = TRUE)[1:4]
freqIsl
##      A      C      G      T 
## 313845 563875 564789 318990
freqIsl / sum(freqIsl)
##         A         C         G         T 
## 0.1781693 0.3201109 0.3206298 0.1810901
freqNon = alphabetFrequency(seqNonCGI, baseOnly = TRUE, collapse = TRUE)[1:4]
freqNon
##        A        C        G        T 
## 42440774 28128852 28127513 42382186
freqNon / sum(freqNon)
##         A         C         G         T 
## 0.3008292 0.1993832 0.1993737 0.3004139

This shows an inverse pattern: in the CpG islands, C and G have frequencies around 0.32, whereas in the non-CpG islands, we have A and T that have frequencies around 0.30.

## -----------------------------------------------------------------------------
alpha = log((freqIsl/sum(freqIsl)) / (freqNon/sum(freqNon)))
alpha
##          A          C          G          T 
## -0.5238084  0.4734387  0.4751059 -0.5061665
beta  = log(MI / MN)
beta
##            A         C         G          T
## A -0.4935945 0.4566418 0.5239611 -0.6390469
## C -0.5927341 0.3342289 1.7365319 -0.4699321
## G -0.4671646 0.4383576 0.3360277 -0.4967508
## T -0.8379216 0.5303960 0.4002977 -0.4821485
s = unlist(strsplit("ACGTTATACTACG", ""))
s
##  [1] "A" "C" "G" "T" "T" "A" "T" "A" "C" "T" "A" "C" "G"
beta[s[2-1], s[2]]
## [1] 0.4566418
## -----------------------------------------------------------------------------
x = "ACGTTATACTACG"
scorefun = function(x) {
  s = unlist(strsplit(x, ""))
  score = alpha[s[1]]
  if (length(s) >= 2)
    for (j in 2:length(s))
      score = score + beta[s[j-1], s[j]]
  score
}
scorefun(x)
##          A 
## -0.2824623
## -----------------------------------------------------------------------------
generateRandomScores = function(s, len = 100, B = 1000) {
  alphFreq = alphabetFrequency(s)
  isGoodSeq = rowSums(alphFreq[, 5:ncol(alphFreq)]) == 0
  s = s[isGoodSeq]
  slen = sapply(s, length)
  prob = pmax(slen - len, 0)
  prob = prob / sum(prob)
  idx  = sample(length(s), B, replace = TRUE, prob = prob)
  ssmp = s[idx]
  start = sapply(ssmp, function(x) sample(length(x) - len, 1))
  scores = sapply(seq_len(B), function(i)
    scorefun(as.character(ssmp[[i]][start[i]+(1:len)]))
  )
  scores / len
}
scoresCGI    = generateRandomScores(seqCGI)
scoresNonCGI = generateRandomScores(seqNonCGI)
## -----------------------------------------------------------------------------
rgs = range(c(scoresCGI, scoresNonCGI))
rgs
## [1] -0.6191153  0.6016392
br = seq(rgs[1], rgs[2], length.out = 50)
h1 = hist(scoresCGI,    breaks = br, plot = FALSE)
h2 = hist(scoresNonCGI, breaks = br, plot = FALSE)
plot(h1, col = rgb(0, 0, 1, 1/4), xlim = c(-0.5, 0.5), ylim=c(0,120))
plot(h2, col = rgb(1, 0, 0, 1/4), add = TRUE)

## -----------------------------------------------------------------------------
## ###This is for provenance reasons, keep track of how the data
## ###were generated for the EM exercise in Chapter 4.
## Mdata=c(scoresCGI,scoresNonCGI)
## MM1=sample(Mdata[1:1000],800)
## MM2=sample(Mdata[1001:2000],1000)
## Myst=c(MM1,MM2);names(Myst)=NULL
## saveRDS(c(MM1,MM2),"../data/Myst.rds")
## ###True value of m1,m2,s1 and s2
## ###

## -----------------------------------------------------------------------------
stopifnot(max(h1$counts) < 120, max(h2$counts) < 120,
          h1$breaks[1] >= br[1], h1$breaks[length(h1$breaks)] <= br[length(br)],
          h2$breaks[1] >= br[1], h2$breaks[length(h2$breaks)] <= br[length(br)])
## -----------------------------------------------------------------------------
setwd("D:/books/R/blogdown/hugo")
mtb = read.table("../data/M_tuberculosis.txt", header = TRUE)
head(mtb, n = 4)
##   AmAcid Codon Number PerThous
## 1    Gly   GGG  25874    19.25
## 2    Gly   GGA  13306     9.90
## 3    Gly   GGT  25320    18.84
## 4    Gly   GGC  68310    50.82
## -----------------------------------------------------------------------------
Gly  =  mtb[ mtb$AmAcid == "Gly", "Number"]
Gly
## [1] 25874 13306 25320 68310
Gly/sum(Gly)
## [1] 0.1948197 0.1001882 0.1906483 0.5143438
dim(mtb)
## [1] 64  4
## -----------------------------------------------------------------------------
pro  =  mtb[ mtb$AmAcid == "Pro", "Number"]
pro
## [1] 42424  8229  4578 22895
pro/sum(pro)
## [1] 0.54302025 0.10532985 0.05859765 0.29305225
table(mtb$AmAcid)
## 
## Ala Arg Asn Asp Cys End Gln Glu Gly His Ile Leu Lys Met Phe Pro Ser Thr Trp Tyr 
##   4   6   2   2   2   3   2   2   4   2   3   6   2   1   2   4   6   4   1   2 
## Val 
##   4
table(mtb$Codon)
## 
## AAA AAC AAG AAT ACA ACC ACG ACT AGA AGC AGG AGT ATA ATC ATG ATT CAA CAC CAG CAT 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## CCA CCC CCG CCT CGA CGC CGG CGT CTA CTC CTG CTT GAA GAC GAG GAT GCA GCC GCG GCT 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## GGA GGC GGG GGT GTA GTC GTG GTT TAA TAC TAG TAT TCA TCC TCG TCT TGA TGC TGG TGT 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## TTA TTC TTG TTT 
##   1   1   1   1
## -----------------------------------------------------------------------------
setwd("D:/books/R/blogdown/hugo")
staph = readDNAStringSet("../data/staphsequence.ffn.txt", "fasta")

## -----------------------------------------------------------------------------
staph[1:3, ]
## DNAStringSet object of length 3:
##     width seq                                               names               
## [1]  1362 ATGTCGGAAAAAGAAATTTGGGA...AAAAAGAAATAAGAAATGTATAA lcl|NC_002952.2_c...
## [2]  1134 ATGATGGAATTCACTATTAAAAG...TTTTACCAATCAGAACTTACTAA lcl|NC_002952.2_c...
## [3]   246 GTGATTATTTTGGTTCAAGAAGT...TCATTCATCAAGGTGAACAATGA lcl|NC_002952.2_c...
staph
## DNAStringSet object of length 2650:
##        width seq                                            names               
##    [1]  1362 ATGTCGGAAAAAGAAATTTGGG...AAAGAAATAAGAAATGTATAA lcl|NC_002952.2_c...
##    [2]  1134 ATGATGGAATTCACTATTAAAA...TTACCAATCAGAACTTACTAA lcl|NC_002952.2_c...
##    [3]   246 GTGATTATTTTGGTTCAAGAAG...ATTCATCAAGGTGAACAATGA lcl|NC_002952.2_c...
##    [4]  1113 ATGAAGTTAAATACACTCCAAT...CAAGGTGAAATTATAAAGTAA lcl|NC_002952.2_c...
##    [5]  1932 GTGACTGCATTGTCAGATGTAA...TATGCAAACTTAGACTTCTAA lcl|NC_002952.2_c...
##    ...   ... ...
## [2646]   720 ATGACTGTAGAATGGTTAGCAG...ACTCCTTTACTTGAAAAATAA lcl|NC_002952.2_c...
## [2647]  1878 GTGGTTCAAGAATATGATGTAA...CTCCAAAGGGTGAGTGACTAA lcl|NC_002952.2_c...
## [2648]  1380 ATGGATTTAGATACAATTACGA...CAATTCTGCTTAGGTAAATAG lcl|NC_002952.2_c...
## [2649]   348 TTGGAAAAAGCTTACCGAATTA...TTTAATAAAAAGATTAAGTAA lcl|NC_002952.2_c...
## [2650]   138 ATGGTAAAACGTACTTATCAAC...CGTAAAGTTTTATCTGCATAA lcl|NC_002952.2_c...
## -----------------------------------------------------------------------------
letterFrequency(staph[[1]], letters = "ACGT", OR = 0)
##   A   C   G   T 
## 522 219 229 392
GCstaph = data.frame(
  ID = names(staph),
  GC = rowSums(alphabetFrequency(staph)[, 2:3] / width(staph)) * 100
)
## -----------------------------------------------------------------------------
window = 100
gc = rowSums( letterFrequencyInSlidingView(staph[[364]], window,
      c("G","C")))/window
plot(x = seq(along = gc), y = gc, type = "l")

## -----------------------------------------------------------------------------
plot(x = seq(along = gc), y = gc, type = "l")
lines(lowess(x = seq(along = gc), y = gc, f = 0.2), col = 2)

## -----------------------------------------------------------------------------
dfbetas = data.frame(
  p = rep(p_seq, 5),
  dbeta = c(dbeta(p_seq, 0.5, 0.5), 
            dbeta(p_seq,   1,   1), 
            dbeta(p_seq,  10,  30),
            dbeta(p_seq,  20,  60), 
            dbeta(p_seq,  50, 150)),
  pars = rep(c("Beta(0.5,0.5)", "U(0,1)=Beta(1,1)", 
               "Beta(10,30)", "Beta(20,60)", 
               "Beta(50,150)"), each = length(p_seq)))
ggplot(dfbetas) +
  geom_line(aes(x = p, y = dbeta, colour = pars)) +
  theme(legend.title = element_blank()) +
  geom_vline(aes(xintercept = 0.25), colour = "#990000", linetype = "dashed")

3 High Quality Graphics in R

## -----------------------------------------------------------------------------
## ## produces the xkcd-plotting image used below
library("xkcd")
## Loading required package: extrafont
## Registering fonts with R
library("showtext")
## Loading required package: sysfonts
## Loading required package: showtextdb
## 
## Attaching package: 'showtextdb'
## The following object is masked from 'package:extrafont':
## 
##     font_install
library("sysfonts")
library("tibble")
## 
## Attaching package: 'tibble'
## The following object is masked from 'package:igraph':
## 
##     as_data_frame
## 
introplotdata = tibble(
   y = c(seq(-8, 1, length=25)^2, rep(1, 5), seq(1, 5,length=25)^2)^2,
   x = seq(1, 55, length.out = length(y)))

 dataman = tibble(
   x = 30,
   y = 400,
   scale = 100,
   ratioxy = 0.1,
   angleofspine =  -pi/2 ,
   anglerighthumerus = -pi/6,
   anglelefthumerus  = pi * 7/6,
   anglerightradius = 0,
   angleleftradius = 0,
   angleleftleg  = 19*pi/12,
   anglerightleg = 17*pi/12,
   angleofneck   = 1.4*pi)
 
 mapping = do.call(aes_string, colnames(dataman) %>% (function(x) setNames(as.list(x), x)))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(introplotdata) + geom_line(aes(x = x, y = y), size = 2) +
    xkcdaxis(c(0, 50), c(0, 1000)) + xlab("Time to make plot in minutes") +
    ylab("Time to understand plot in minutes") + xkcdman(mapping, dataman) +
    theme(axis.title.x = element_text(margin = margin(15, 0, 0, 0)))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in theme_xkcd(): Not xkcd fonts installed! See vignette("xkcd-intro")

## -----------------------------------------------------------------------------
head(DNase)
##   Run       conc density
## 1   1 0.04882812   0.017
## 2   1 0.04882812   0.018
## 3   1 0.19531250   0.121
## 4   1 0.19531250   0.124
## 5   1 0.39062500   0.206
## 6   1 0.39062500   0.215
plot(DNase$conc, DNase$density)

## -----------------------------------------------------------------------------
plot(DNase$conc, DNase$density,
  ylab = attr(DNase, "labels")$y,
  xlab = paste(attr(DNase, "labels")$x, attr(DNase, "units")$x),
  pch = 3,
  col = "blue")

## -----------------------------------------------------------------------------
hist(DNase$density, breaks=25, main = "")

boxplot(density ~ Run, data = DNase)

## -----------------------------------------------------------------------------
library("Hiiragi2013")
## Loading required package: affy
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: boot
## Loading required package: clue
## Loading required package: cluster
## Loading required package: genefilter
## Loading required package: geneplotter
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## The following object is masked from 'package:evd':
## 
##     qq
## Loading required package: annotate
## Loading required package: AnnotationDbi
## Loading required package: XML
## 
## Attaching package: 'geneplotter'
## The following object is masked from 'package:Gviz':
## 
##     imageMap
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:rtracklayer':
## 
##     space
## The following object is masked from 'package:IRanges':
## 
##     space
## The following object is masked from 'package:S4Vectors':
## 
##     space
## The following object is masked from 'package:stats':
## 
##     lowess
## Loading required package: gtools
## 
## Attaching package: 'gtools'
## The following objects are masked from 'package:boot':
## 
##     inv.logit, logit
## The following object is masked from 'package:igraph':
## 
##     permute
## Loading required package: KEGGREST
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked _by_ '.GlobalEnv':
## 
##     genotype
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:genefilter':
## 
##     area
## Loading required package: mouse4302.db
## Loading required package: org.Mm.eg.db
## 
## 
## Loading required package: RColorBrewer
## Loading required package: xtable
data("x")
dim(Biobase::exprs(x))
## [1] 45101   101
## -----------------------------------------------------------------------------
head(pData(x), n = 2)
##         File.name Embryonic.day Total.number.of.cells lineage genotype
## 1 E3.25  1_C32_IN         E3.25                    32               WT
## 2 E3.25  2_C32_IN         E3.25                    32               WT
##           ScanDate sampleGroup sampleColour
## 1 E3.25 2011-03-16       E3.25      #CAB2D6
## 2 E3.25 2011-03-16       E3.25      #CAB2D6
## -----------------------------------------------------------------------------
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:igraph':
## 
##     as_data_frame, groups, union
## The following object is masked from 'package:HardyWeinberg':
## 
##     recode
## The following objects are masked from 'package:Biostrings':
## 
##     collapse, intersect, setdiff, setequal, union
## The following object is masked from 'package:XVector':
## 
##     slice
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
groups = group_by(pData(x), sampleGroup) %>%
  summarise(n = n(), color = unique(sampleColour))
groups
## # A tibble: 8 × 3
##   sampleGroup         n color  
##   <chr>           <int> <chr>  
## 1 E3.25              36 #CAB2D6
## 2 E3.25 (FGF4-KO)    17 #FDBF6F
## 3 E3.5 (EPI)         11 #A6CEE3
## 4 E3.5 (FGF4-KO)      8 #FF7F00
## 5 E3.5 (PE)          11 #B2DF8A
## 6 E4.5 (EPI)          4 #1F78B4
## 7 E4.5 (FGF4-KO)     10 #E31A1C
## 8 E4.5 (PE)           4 #33A02C
## -----------------------------------------------------------------------------
## f(x) %>% g(y) %>% h
## h(g(f(x), y))

## -----------------------------------------------------------------------------
library("ggplot2")
ggplot(DNase, aes(x = conc, y = density)) + geom_point()

## -----------------------------------------------------------------------------
ggplot(groups, aes(x = sampleGroup, y = n)) +
  geom_bar(stat = "identity")

## -----------------------------------------------------------------------------
## check an assertion made in the text above
stopifnot(formals(ggplot2::geom_bar)$stat=="count")
## -----------------------------------------------------------------------------
groupColor = setNames(groups$color, groups$sampleGroup)

## -----------------------------------------------------------------------------
ggplot(groups, aes(x = sampleGroup, y = n, fill = sampleGroup)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = groupColor, name = "Groups") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

## -----------------------------------------------------------------------------
gg = ggplot(DNase, aes(x = conc, y = density)) + geom_point()

## -----------------------------------------------------------------------------
## gg
print(gg)

## -----------------------------------------------------------------------------
ggplot2::ggsave("DNAse-histogram-demo.pdf", plot = gg)
## Saving 6 x 5 in image
## -----------------------------------------------------------------------------
file.remove("DNAse-histogram-demo.pdf")
## [1] TRUE
## -----------------------------------------------------------------------------
library("mouse4302.db")
## -----------------------------------------------------------------------------
## # I used this code to find the below two probes
idx = order(rowVars(Biobase::exprs(x)), decreasing=TRUE)[seq_len(2000)]
cc  = cor(t(Biobase::exprs(x)[idx,]))
cco = order(cc)[seq(1, 1001, by=2) ]
jj2 = rownames(Biobase::exprs(x))[ idx[ (cco-1) %/% length(idx) + 1 ] ]
jj1 = rownames(Biobase::exprs(x))[ idx[ (cco-1) %%  length(idx) + 1 ] ]
dftx = as.data.frame(t(Biobase::exprs(x)))
#par(ask=TRUE)
for(i in seq(along = cco)) {
  df = AnnotationDbi::select(mouse4302.db,
                             keys = c(jj1[i], jj2[i]), keytype = "PROBEID",
    columns = c("SYMBOL", "GENENAME"))
   print(ggplot(dftx, aes( x = get(jj1[i]), y = get(jj2[i]))) +
   geom_point(shape = 1) +
   xlab(paste(jj1[i], df$SYMBOL[1])) +
   ylab(paste(jj2[i], df$SYMBOL[2])) +
   ggtitle(round(cc[jj1[i], jj2[i]], 3)) + geom_smooth(method = "loess"))
}
## 'select()' returned 1:1 mapping between keys and columns
## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:many mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'
## 'select()' returned 1:1 mapping between keys and columns

## `geom_smooth()` using formula = 'y ~ x'

## -----------------------------------------------------------------------------
dftx = data.frame(t(Biobase::exprs(x)), pData(x))
ggplot( dftx, aes( x = X1426642_at, y = X1418765_at )) +
  geom_point( shape = 1 ) +
  geom_smooth( method = "loess" )
## `geom_smooth()` using formula = 'y ~ x'

## -----------------------------------------------------------------------------
stopifnot(is(dftx, "data.frame"))
## -----------------------------------------------------------------------------
.one <- AnnotationDbi::select(mouse4302.db, 
                              keys = "1418765_at", 
                              keytype = "PROBEID", 
                              columns = "SYMBOL")$SYMBOL
## 'select()' returned 1:1 mapping between keys and columns
.two <- AnnotationDbi::select(mouse4302.db, 
                              keys = "1426642_at", 
                              keytype = "PROBEID", 
                              columns = "SYMBOL")$SYMBOL
## 'select()' returned 1:1 mapping between keys and columns
## -----------------------------------------------------------------------------
ggplot( dftx, aes( x = X1426642_at, y = X1418765_at ))  +
  geom_point( aes( color = sampleColour), shape = 19 ) +
  geom_smooth( method = "loess" ) +
  scale_color_discrete( guide = "none" )
## `geom_smooth()` using formula = 'y ~ x'

## -----------------------------------------------------------------------------
library("mouse4302.db")



## -----------------------------------------------------------------------------
AnnotationDbi::select(mouse4302.db,
   keys = c("1426642_at", "1418765_at"), keytype = "PROBEID",
   columns = c("SYMBOL", "GENENAME"))
## 'select()' returned 1:1 mapping between keys and columns
##      PROBEID SYMBOL                                            GENENAME
## 1 1426642_at    Fn1                                       fibronectin 1
## 2 1418765_at  Timd2 T cell immunoglobulin and mucin domain containing 2
## -----------------------------------------------------------------------------
dfx = as.data.frame(Biobase::exprs(x))
ggplot(dfx, aes(x = `20 E3.25`)) + geom_histogram(binwidth = 0.2)

## -----------------------------------------------------------------------------
pb = ggplot(groups, aes(x = sampleGroup, y = n))

## -----------------------------------------------------------------------------
class(pb)
## [1] "gg"     "ggplot"
pb

## -----------------------------------------------------------------------------
pb = pb + geom_bar(stat = "identity")
pb = pb + aes(fill = sampleGroup)
pb = pb + theme(axis.text.x = element_text(angle = 90, hjust = 1))
pb = pb + scale_fill_manual(values = groupColor, name = "Groups")
pb

## -----------------------------------------------------------------------------
pb.polar = pb + coord_polar() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1),
        axis.text.y = element_blank(),
        axis.ticks = element_blank()) +
  xlab("") + ylab("")
pb.polar

## -----------------------------------------------------------------------------
selectedProbes = c( Fgf4 = "1420085_at", Gata4 = "1418863_at",
                   Gata6 = "1425463_at",  Sox2 = "1416967_at")

## -----------------------------------------------------------------------------
## # How I found the selectedProbes:
## AnnotationDbi::select(mouse4302.db,
##    keys = c("Fgf4", "Sox2", "Gata6", "Gata4"), keytype = "SYMBOL",
##    columns = c("PROBEID"))

## -----------------------------------------------------------------------------
selectedProbes2 = AnnotationDbi::select(mouse4302.db,
   keys = selectedProbes, keytype = "PROBEID", columns = c("SYMBOL"))
## 'select()' returned 1:1 mapping between keys and columns
stopifnot(identical(sort(selectedProbes2$SYMBOL), sort(names(selectedProbes))),
          all(selectedProbes[selectedProbes2$SYMBOL] == selectedProbes2$PROBEID))
## -----------------------------------------------------------------------------
library("reshape2")
genes = melt(Biobase::exprs(x)[selectedProbes, ],
             varnames = c("probe", "sample"))

## -----------------------------------------------------------------------------
genes$gene =
  names(selectedProbes)[match(genes$probe, selectedProbes)]
head(genes)
##        probe  sample    value  gene
## 1 1420085_at 1 E3.25 3.027715  Fgf4
## 2 1418863_at 1 E3.25 4.843137 Gata4
## 3 1425463_at 1 E3.25 5.500618 Gata6
## 4 1416967_at 1 E3.25 1.731217  Sox2
## 5 1420085_at 2 E3.25 9.293016  Fgf4
## 6 1418863_at 2 E3.25 5.530016 Gata4
## -----------------------------------------------------------------------------
ggplot(genes, aes(x = gene, y = value)) +
  stat_summary(fun = mean, geom = "bar")

## -----------------------------------------------------------------------------
library("Hmisc")
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
## 
##     aml
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:xtable':
## 
##     label, label<-
## The following object is masked from 'package:AnnotationDbi':
## 
##     contents
## The following object is masked from 'package:Biobase':
## 
##     contents
## The following objects are masked from 'package:Biostrings':
## 
##     mask, translate
## The following objects are masked from 'package:base':
## 
##     format.pval, units
ggplot(genes, aes( x = gene, y = value, fill = gene)) +
  stat_summary(fun = mean, geom = "bar") +
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar",
               width = 0.25)

## -----------------------------------------------------------------------------
p = ggplot(genes, aes( x = gene, y = value, fill = gene))
p + geom_boxplot()

## -----------------------------------------------------------------------------
p + geom_dotplot(binaxis = "y", binwidth = 1/6,
       stackdir = "center", stackratio = 0.75,
       aes(color = gene))

library("ggbeeswarm")
p + geom_beeswarm(aes(color = gene))

## ------------
#-----------------------------------------------------------------
ggplot(genes, aes( x = value, color = gene)) + geom_density()

## -----------------------------------------------------------------------------
p + geom_violin()

## -----------------------------------------------------------------------------
library("ggridges")
ggplot(genes, aes(x = value, y = gene, fill = gene)) + 
  geom_density_ridges()
## Picking joint bandwidth of 0.729

## -----------------------------------------------------------------------------
top42 = order(rowMeans(Biobase::exprs(x)), decreasing = TRUE)[1:42]
g42 = melt(Biobase::exprs(x)[rev(top42), ], varnames = c("probe", "sample"))
ggplot(g42, aes(x = value, y = probe)) 

## -----------------------------------------------------------------------------
ggplot(g42, aes(x = value, y = probe)) + 
  geom_density_ridges() + theme(legend.position = "none",
    axis.title.y = element_blank(), axis.text.y = element_blank(),
    axis.ticks.y = element_blank()) + xlim(13, 15) 
## Picking joint bandwidth of 0.0683
## Warning: Removed 35 rows containing non-finite values (`stat_density_ridges()`).

## -----------------------------------------------------------------------------
simdata = rnorm(70)
tibble(index = seq(along = simdata),
          sx = sort(simdata)) %>%
ggplot(aes(x = sx, y = index)) + geom_step()

## -----------------------------------------------------------------------------
ggplot(genes, aes( x = value, color = gene)) + stat_ecdf()

## -----------------------------------------------------------------------------
## # I used the functon bimodality_coefficient from the modes package to identify the most
## # bimodal looking array, number 64
## j0 = which.max(vapply(seq_len(ncol(x)), function(j){
##        modes  ::    bimodality_coefficient(Biobase::exprs(x)[, j])
##     }, numeric(1)))

## -----------------------------------------------------------------------------
ggplot(dfx, aes(x = `64 E4.5 (EPI)`)) + geom_histogram(bins = 100)

ggplot(dfx, aes(x = 2 ^ `64 E4.5 (EPI)`)) + 
  geom_histogram(binwidth = 20) + xlim(0, 1500)
## Warning: Removed 1457 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

## -----------------------------------------------------------------------------
scp = ggplot(dfx, aes(x = `59 E4.5 (PE)` ,
                      y = `92 E4.5 (FGF4-KO)`))
scp + geom_point()

## -----------------------------------------------------------------------------
scp  + geom_point(alpha = 0.1)

## -----------------------------------------------------------------------------
scp + geom_density2d()

## -----------------------------------------------------------------------------
scp + geom_density2d(h = 0.5, bins = 60)

library("RColorBrewer")
colorscale = scale_fill_gradientn(
    colors = rev(brewer.pal(9, "YlGnBu")),
    values = c(0, exp(seq(-5, 0, length.out = 100))))

scp + stat_density2d(h = 0.5, bins = 60,
          aes( fill = after_stat(level)), geom = "polygon") +
  colorscale + coord_fixed()

## -----------------------------------------------------------------------------
scp + geom_hex() + coord_fixed()

scp + geom_hex(binwidth = c(0.2, 0.2)) + colorscale +
  coord_fixed()

## -----------------------------------------------------------------------------
library("ggthemes")
sunsp = tibble(year   = time(sunspot.year),
               number = as.numeric(sunspot.year))
sp = ggplot(sunsp, aes(x = year, y = number)) + geom_line()
sp
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

ratio = with(sunsp, bank_slopes(year, number))
sp + coord_fixed(ratio = ratio)
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

## -----------------------------------------------------------------------------
library("magrittr")
dftx$lineage %<>% sub("^$", "no", .)
dftx$lineage %<>% factor(levels = c("no", "EPI", "PE", "FGF4-KO"))

ggplot(dftx, aes(x = X1426642_at, y = X1418765_at)) +
  geom_point() + facet_grid( . ~ lineage )

## -----------------------------------------------------------------------------
ggplot(dftx,
  aes(x = X1426642_at, y = X1418765_at)) + geom_point() +
   facet_grid( Embryonic.day ~ lineage )

## -----------------------------------------------------------------------------
ggplot(mutate(dftx, Tdgf1 = cut(X1450989_at, breaks = 4)),
   aes(x = X1426642_at, y = X1418765_at)) + geom_point() +
   facet_wrap( ~ Tdgf1, ncol = 2 )

## -----------------------------------------------------------------------------
library("plotly")
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
## 
##     subplot
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:igraph':
## 
##     groups
## The following object is masked from 'package:BSgenome':
## 
##     export
## The following object is masked from 'package:rtracklayer':
## 
##     export
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(economics, x = ~ date, y = ~ unemploy / pop)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## -----------------------------------------------------------------------------
data("volcano")
volcanoData = list(
  x = 10 * seq_len(nrow(volcano)),
  y = 10 * seq_len(ncol(volcano)),
  z = volcano,
  col = terrain.colors(500)[cut(volcano, breaks = 500)]
)
library("rgl")
with(volcanoData, persp3d(x, y, z, color = col))
## -----------------------------------------------------------------------------
.volcanocut = cut(volcano, breaks = 500)
stopifnot(!any(is.na(.volcanocut)), all(as.integer(.volcanocut) %in% 1:500))
## -----------------------------------------------------------------------------
par(mai = rep(0, 4)) # A numerical vector of the form c(bottom, left, top, right) which gives the margin size specified in inches.
pie(rep(1, 8), col=1:8)

tibble(u = factor(1:8), v = 1)
## # A tibble: 8 × 2
##   u         v
##   <fct> <dbl>
## 1 1         1
## 2 2         1
## 3 3         1
## 4 4         1
## 5 5         1
## 6 6         1
## 7 7         1
## 8 8         1
## -----------------------------------------------------------------------------
ggplot(tibble(u = factor(1:8), v = 1), 
       aes(x = "",  y = v, fill = u)) +
  geom_bar(stat = "identity", width = 1) + 
  coord_polar("y", start = 0) + theme_void()

## -----------------------------------------------------------------------------
par(mai = c(0, 0.8, 0, 0))
display.brewer.all()

## -----------------------------------------------------------------------------
head(brewer.pal.info)
##      maxcolors category colorblind
## BrBG        11      div       TRUE
## PiYG        11      div       TRUE
## PRGn        11      div       TRUE
## PuOr        11      div       TRUE
## RdBu        11      div       TRUE
## RdGy        11      div      FALSE
table(brewer.pal.info$category)
## 
##  div qual  seq 
##    9    8   18
## -----------------------------------------------------------------------------
brewer.pal(4, "RdYlGn") #Creates nice looking color palettes especially for thematic maps
## [1] "#D7191C" "#FDAE61" "#A6D96A" "#1A9641"
## -----------------------------------------------------------------------------
par(mai = rep(0.1, 4))
mypalette  = colorRampPalette(
    c("darkorange3", "white","darkblue")
  )(100)
head(mypalette)
## [1] "#CD6600" "#CE6905" "#CF6C0A" "#D06F0F" "#D17214" "#D27519"
image(matrix(1:100, nrow = 100, ncol = 10), col = mypalette,
        xaxt = "n", yaxt = "n", useRaster = TRUE)

## -----------------------------------------------------------------------------
library("pheatmap")
topGenes = order(rowVars(Biobase::exprs(x)), decreasing = TRUE)[1:500]
rowCenter = function(x) { x - rowMeans(x) }
pheatmap( rowCenter(Biobase::exprs(x)[ topGenes, ] ),
  show_rownames = FALSE, show_colnames = FALSE,
  breaks = seq(-5, +5, length = 101),
  annotation_col =
    pData(x)[, c("sampleGroup", "Embryonic.day", "ScanDate") ],
  annotation_colors = list(
    sampleGroup = groupColor,
    genotype = c(`FGF4-KO` = "chocolate1", `WT` = "azure2"),
    Embryonic.day = setNames(brewer.pal(9, "Blues")[c(3, 6, 9)],
                             c("E3.25", "E3.5", "E4.5")),
    ScanDate = setNames(brewer.pal(nlevels(x$ScanDate), "YlGn"),
                        levels(x$ScanDate))
  ),
  cutree_rows = 4
)

## -----------------------------------------------------------------------------
groupColor[1]
##     E3.25 
## "#CAB2D6"
## -----------------------------------------------------------------------------
hexvals = sapply(1:3, function(i) substr(groupColor[1], i*2, i*2+1))
decvals = strtoi(paste0("0x", hexvals))
## -----------------------------------------------------------------------------
library("colorspace")
## 
## Attaching package: 'colorspace'
## The following object is masked from 'package:Gviz':
## 
##     coords
library("grid")

plothcl = function(h, c, l, what, x0 = 0.5, y0 = 0.5, default.units = "npc", ...) {
  switch(what,
         "c" = {
           stopifnot(length(l)==1)
           n = length(c)
         },
         "l" = {
           stopifnot(length(c)==1)
           n = length(l)
         },
         stop("Sapperlot"))

  cr = seq(0.1, 0.5, length = n+1)
  dr = 0.05 / n

  for (j in seq_len(n)) {
    r = c(cr[j]+dr, cr[j+1]-dr)
    for(i in 1:(length(h)-1)){
      phi = seq(h[i], h[i+1], by=1)/180*pi
      px = x0 + c(r[1]*cos(phi), r[2]*rev(cos(phi)))
      py = y0 + c(r[1]*sin(phi), r[2]*rev(sin(phi)))
      mycol = switch(what,
        "c" = hcl(h=mean(h[i+(0:1)]), c=c[j], l=l),
        "l" = hcl(h=mean(h[i+(0:1)]), c=c, l=l[j]))
      grid::grid.polygon(px, py, 
                         gp=gpar(col=mycol, fill=mycol),
                         default.units=default.units,...)
    }
  }
}


## -----------------------------------------------------------------------------
plothcl( h = seq(0, 360, by=3), c = seq(5, 75, by=10), l = 75,   what="c")

grid.newpage()
plothcl( h = seq(0, 360, by=3), c = 55, l = seq(20, 100, by=10), what="l")

## -----------------------------------------------------------------------------
gg = ggplot(tibble(A = Biobase::exprs(x)[, 1], M = rnorm(length(A))),
            aes(y = M))
gg + geom_point(aes(x = A), size = 0.2)

gg + geom_point(aes(x = rank(A)), size = 0.2)

## -----------------------------------------------------------------------------
volume = function(rho, nu)
            pi^(nu/2) * rho^nu / gamma(nu/2+1)

ggplot(tibble(nu    = 1:15,
  Omega = volume(1, nu)), aes(x = nu, y = Omega)) +
geom_line() +
xlab(expression(nu)) + ylab(expression(Omega)) +
geom_text(label =
"Omega(rho,nu)==frac(pi^frac(nu,2)~rho^nu, Gamma(frac(nu,2)+1))",
  parse = TRUE, x = 6, y = 1.5)

## -----------------------------------------------------------------------------
ggplot(genes, aes( x = value, color = gene)) + stat_ecdf() +
  theme(text = element_text(family = "serif"))

## -----------------------------------------------------------------------------
ggplot(genes, aes( x = value, color = gene)) + stat_ecdf() + theme(text = element_text(family = "Bauhaus 93"))

## -----------------------------------------------------------------------------
library("ggbio")
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Need specific help about ggbio? try mailing 
##  the maintainer or visit https://lawremi.github.io/ggbio/
## 
## Attaching package: 'ggbio'
## The following object is masked from 'package:Hmisc':
## 
##     zoom
## The following objects are masked from 'package:ggplot2':
## 
##     geom_bar, geom_rect, geom_segment, ggsave, stat_bin, stat_identity,
##     xlim
data("hg19IdeogramCyto", package = "biovizBase")
plotIdeogram(hg19IdeogramCyto, subchr = "chr1")

## -----------------------------------------------------------------------------
library("GenomicRanges")
data("darned_hg19_subset500", package = "biovizBase")
autoplot(darned_hg19_subset500, layout = "karyogram",
         aes(color = exReg, fill = exReg))
## Warning in getIdeoGR(data): geom(ideogram) need valid seqlengths information for accurate mapping,
##                  now use reduced information as ideogram...

## Warning in getIdeoGR(data): geom(ideogram) need valid seqlengths information for accurate mapping,
##                  now use reduced information as ideogram...
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

## -----------------------------------------------------------------------------
data("ideoCyto", package = "biovizBase")
dn = darned_hg19_subset500
seqlengths(dn) = seqlengths(ideoCyto$hg19)[names(seqlengths(dn))]
dn = keepSeqlevels(dn, paste0("chr", c(1:22, "X")))
autoplot(dn, layout = "karyogram", aes(color = exReg, fill = exReg))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

## -----------------------------------------------------------------------------
darned_hg19_subset500[1:2,]
## GRanges object with 2 ranges and 10 metadata columns:
##       seqnames    ranges strand |       inchr       inrna         snp
##          <Rle> <IRanges>  <Rle> | <character> <character> <character>
##   [1]     chr5  86618225      - |           A           I        <NA>
##   [2]     chr7  99792382      - |           A           I        <NA>
##              gene      seqReg       exReg      source      ests      esta
##       <character> <character> <character> <character> <integer> <integer>
##   [1]        <NA>           O        <NA>    amygdala         0         0
##   [2]        <NA>           O        <NA>        <NA>         0         0
##            author
##       <character>
##   [1]    15342557
##   [2]    15342557
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths
## -----------------------------------------------------------------------------
stopifnot(is(darned_hg19_subset500, "GRanges"), identical(start(darned_hg19_subset500),end(darned_hg19_subset500)))

## -----------------------------------------------------------------------------
ggcars = ggplot(mtcars, aes(x = hp, y = mpg)) + geom_point(size = 5, color = 'red')
ggcars

ggcars + theme_bw()

ggcars + theme_minimal()

4 Mixture Models

## -----------------------------------------------------------------------------
coinflips = (runif(10000) > 0.5)
table(coinflips)
## coinflips
## FALSE  TRUE 
##  4992  5008
oneFlip = function(fl, mean1 = 1, mean2 = 3, sd1 = 0.5, sd2 = 0.5) {
  if (fl) {
   rnorm(1, mean1, sd1)
  } else {
   rnorm(1, mean2, sd2)
  }
}
fairmix = vapply(coinflips, oneFlip, numeric(1))
library("ggplot2")
library("dplyr")
ggplot(tibble(value = fairmix), aes(x = value)) +
     geom_histogram(fill = "purple", binwidth = 0.1)

## -----------------------------------------------------------------------------
means = c(1, 3)
sds   = c(0.5, 0.5)
values = rnorm(length(coinflips),
               mean = ifelse(coinflips, means[1], means[2]),
               sd   = ifelse(coinflips, sds[1],   sds[2]))



## -----------------------------------------------------------------------------
fair = tibble(
  coinflips = (runif(1e6) > 0.5),
  values = rnorm(length(coinflips),
                 mean = ifelse(coinflips, means[1], means[2]),
                 sd   = ifelse(coinflips, sds[1],   sds[2])))
ggplot(fair, aes(x = values)) +
     geom_histogram(fill = "purple", bins = 200)

## -----------------------------------------------------------------------------
ggplot(dplyr::filter(fair, coinflips), aes(x = values)) +
  geom_histogram(aes(y = after_stat(density)), fill = "purple", binwidth = 0.01) +
  stat_function(fun = dnorm, color = "red",
                args = list(mean = means[1], sd = sds[1]))

## -----------------------------------------------------------------------------
fairtheory = tibble(
  x = seq(-1, 5, length.out = 1000),
  f = 0.5 * dnorm(x, mean = means[1], sd = sds[1]) +
      0.5 * dnorm(x, mean = means[2], sd = sds[2]))
ggplot(fairtheory, aes(x = x, y = f)) +
  geom_line(color = "red", linewidth = 1.5) + ylab("mixture density")

## -----------------------------------------------------------------------------
mystery = tibble(
  coinflips = (runif(1e3) > 0.5),
  values = rnorm(length(coinflips),
                 mean = ifelse(coinflips, 1, 2),
                 sd   = ifelse(coinflips, sqrt(.5), sqrt(.5))))
br2 = with(mystery, seq(min(values), max(values), length.out = 30))
ggplot(mystery, aes(x = values)) +
geom_histogram(fill = "purple", breaks = br2)

## -----------------------------------------------------------------------------
head(mystery, 3)
## # A tibble: 3 × 2
##   coinflips values
##   <lgl>      <dbl>
## 1 FALSE      2.83 
## 2 TRUE       1.31 
## 3 TRUE       0.404
br = with(mystery, seq(min(values), max(values), length.out = 30))
ggplot(mystery, aes(x = values)) +
  geom_histogram(data = dplyr::filter(mystery, coinflips),
     fill = "red", alpha = 0.2, breaks = br) +
  geom_histogram(data = dplyr::filter(mystery, !coinflips),
     fill = "darkblue", alpha = 0.2, breaks = br) 

## -----------------------------------------------------------------------------
stopifnot(identical(br2, br))

## -----------------------------------------------------------------------------
ggplot(mystery, aes(x = values, fill = coinflips)) +
  geom_histogram(data = dplyr::filter(mystery, coinflips),
     fill = "red", alpha = 0.2, breaks = br) +
  geom_histogram(data = dplyr::filter(mystery, !coinflips),
     fill = "darkblue", alpha = 0.2, breaks = br) +
  geom_histogram(fill = "purple", breaks = br, alpha = 0.2)

## -----------------------------------------------------------------------------
mus = c(-0.5, 1.5)
lambda = 0.5
u = sample(2, size = 100, replace = TRUE, prob = c(lambda, 1-lambda))
x = rnorm(length(u), mean = mus[u])
dux = tibble(u, x)
head(dux)
## # A tibble: 6 × 2
##       u      x
##   <int>  <dbl>
## 1     2  0.599
## 2     1 -2.98 
## 3     2  1.18 
## 4     1 -0.825
## 5     2  1.67 
## 6     2  2.30
## -----------------------------------------------------------------------------
group_by(dux, u) |> summarise(mu = mean(x), sigma = sd(x))
## # A tibble: 2 × 3
##       u     mu sigma
##   <int>  <dbl> <dbl>
## 1     1 -0.580 0.927
## 2     2  1.47  0.974
table(dux$u) / nrow(dux)
## 
##    1    2 
## 0.56 0.44
## -----------------------------------------------------------------------------
.o = options(digits = 3)
library("mixtools")
## mixtools package, version 2.0.0, Released 2022-12-04
## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772 and the Chan Zuckerberg Initiative: Essential Open Source Software for Science (Grant No. 2020-255193).
## 
## Attaching package: 'mixtools'
## The following object is masked from 'package:gtools':
## 
##     ddirichlet
## The following object is masked from 'package:grid':
## 
##     depth
y = c(rnorm(100, mean = -0.2, sd = 0.5),
      rnorm( 50, mean =  0.5, sd =   1))
gm = normalmixEM(y, k = 2, 
                    lambda = c(0.5, 0.5),
                    mu = c(-0.01, 0.01), 
                    sigma = c(3, 3))
## number of iterations= 296
with(gm, c(lambda, mu, sigma, loglik))
## [1]    0.776    0.224   -0.185    0.823    0.534    0.989 -164.826
options(.o)
gm$lambda
## [1] 0.775846 0.224154
## -----------------------------------------------------------------------------
## PROVENANCE: here's a record of how the data were created
library("mosaics")
## Loading required package: Rcpp
## 
## Attaching package: 'mosaics'
## The following object is masked from 'package:plotly':
## 
##     export
## The following object is masked from 'package:BSgenome':
## 
##     export
## The following object is masked from 'package:rtracklayer':
## 
##     export
library("mosaicsExample")
setwd("D:/books/R/blogdown/hugo")
for(f in c("wgEncodeSydhTfbsGm12878Stat1StdAlnRep1_chr22_sorted.bam",
           "wgEncodeSydhTfbsGm12878InputStdAlnRep1_chr22_sorted.bam"))
  constructBins(infile = system.file(file.path("extdata", f), package="mosaicsExample"),
    fileFormat = "bam", outfileLoc = "../data/",
    byChr = FALSE, useChrfile = FALSE, chrfile = NULL, excludeChr = NULL,
    PET = FALSE, fragLen = 200, binSize = 200, capping = 0)
## ------------------------------------------------------------
## Info: setting summary
## ------------------------------------------------------------
## Name of aligned read file: C:/Users/lidan/R/library/mosaicsExample/extdata/wgEncodeSydhTfbsGm12878Stat1StdAlnRep1_chr22_sorted.bam 
## Aligned read file format: BAM 
## Directory of processed bin-level files: ../data/ 
## Construct bin-level files by chromosome? N 
## Is file for chromosome info provided? N 
## Data type: Single-end tag (SET)
## Average fragment length: 200 
## Bin size: 200 
## ------------------------------------------------------------
## Use the provided BAM index file.
## Chromosome information is extracted from the BAM file.
## Info: reading the aligned read file and processing it into bin-level files...
## Info: done!
## ------------------------------------------------------------
## Info: processing summary
## ------------------------------------------------------------
## Directory of processed bin-level files: ../data/ 
## Processed bin-level file: wgEncodeSydhTfbsGm12878Stat1StdAlnRep1_chr22_sorted.bam_fragL200_bin200.txt
## Sequencing depth: 231822 
## ------------------------------------------------------------
## ------------------------------------------------------------
## Info: setting summary
## ------------------------------------------------------------
## Name of aligned read file: C:/Users/lidan/R/library/mosaicsExample/extdata/wgEncodeSydhTfbsGm12878InputStdAlnRep1_chr22_sorted.bam 
## Aligned read file format: BAM 
## Directory of processed bin-level files: ../data/ 
## Construct bin-level files by chromosome? N 
## Is file for chromosome info provided? N 
## Data type: Single-end tag (SET)
## Average fragment length: 200 
## Bin size: 200 
## ------------------------------------------------------------
## Use the provided BAM index file.
## Chromosome information is extracted from the BAM file.
## Info: reading the aligned read file and processing it into bin-level files...
## Info: done!
## ------------------------------------------------------------
## Info: processing summary
## ------------------------------------------------------------
## Directory of processed bin-level files: ../data/ 
## Processed bin-level file: wgEncodeSydhTfbsGm12878InputStdAlnRep1_chr22_sorted.bam_fragL200_bin200.txt
## Sequencing depth: 182605 
## ------------------------------------------------------------
datafiles = c("../data/wgEncodeSydhTfbsGm12878Stat1StdAlnRep1_chr22_sorted.bam_fragL200_bin200.txt",
              "../data/wgEncodeSydhTfbsGm12878InputStdAlnRep1_chr22_sorted.bam_fragL200_bin200.txt")
binTFBS = readBins(type = c("chip", "input"), fileName = datafiles)
## Info: reading and preprocessing bin-level data...
## Info: data contains only one chromosome.
## Info: done!
## -----------------------------------------------------------------------------
binTFBS
## Summary: bin-level data (class: BinData)
## ----------------------------------------
## - # of chromosomes in the data: 1
## - total effective tag counts: 462479
##   (sum of ChIP tag counts of all bins)
## - control sample is incorporated
## - mappability score is NOT incorporated
## - GC content score is NOT incorporated
## - uni-reads are assumed
## ----------------------------------------
## -----------------------------------------------------------------------------
bincts = print(binTFBS)
ggplot(bincts, aes(x = tagCount)) +
  geom_histogram(binwidth = 1, fill = "forestgreen")

## -----------------------------------------------------------------------------
ggplot(bincts, aes(x = tagCount)) + scale_y_log10() +
   geom_histogram(binwidth = 1, fill = "forestgreen")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 17 rows containing missing values (`geom_bar()`).

## -----------------------------------------------------------------------------
masses = c(A =  331, C =  307, G =  347, T =  322)
probs  = c(A = 0.12, C = 0.38, G = 0.36, T = 0.14)
N  = 7000
sd = 3
nuclt   = sample(length(probs), N, replace = TRUE, prob = probs)
quadwts = rnorm(length(nuclt),
                mean = masses[nuclt],
                sd   = sd)
ggplot(tibble(quadwts = quadwts), aes(x = quadwts)) +
  geom_histogram(bins = 100, fill = "purple")

## -----------------------------------------------------------------------------
library("HistData")
ZeaMays$diff
##  [1]  6.125 -8.375  1.000  2.000  0.750  2.875  3.500  5.125  1.750  3.625
## [11]  7.000  3.000  9.375  7.500 -6.000
ggplot(ZeaMays, aes(x = diff, ymax = 1/15, ymin = 0)) +
  geom_linerange(linewidth = 1, col = "forestgreen") + ylim(0, 0.1)

## -----------------------------------------------------------------------------
stopifnot(nrow(ZeaMays) == 15)
## -----------------------------------------------------------------------------
B = 1000
meds = replicate(B, {
  i = sample(15, 15, replace = TRUE)
  median(ZeaMays$diff[i])
})
ggplot(tibble(medians = meds), aes(x = medians)) +
  geom_histogram(bins = 30, fill = "purple")

## -----------------------------------------------------------------------------
library("bootstrap")
bootstrap(ZeaMays$diff, B, mean)
## $thetastar
##    [1]  1.25833333  1.20833333  4.65000000  3.54166667  4.99166667  4.42500000
##    [7]  2.72500000  3.58333333  1.50833333  2.76666667  3.03333333  1.26666667
##   [13]  2.40833333  3.20000000  1.21666667  3.87500000  3.76666667  2.67500000
##   [19]  2.43333333  2.00833333  1.95000000  3.16666667  2.55833333  1.97500000
##   [25]  1.45000000  1.21666667  2.21666667  2.68333333 -0.70833333  2.12500000
##   [31]  2.87500000  4.55833333  5.39166667  3.76666667  1.94166667  2.76666667
##   [37]  1.30833333  1.85000000  2.95000000  2.76666667  4.50833333  1.97500000
##   [43]  2.50000000  3.17500000  3.01666667 -0.57500000  4.51666667  2.35000000
##   [49]  3.77500000  3.44166667  2.87500000  3.52500000  2.64166667  2.34166667
##   [55]  2.08333333  3.15833333  3.07500000  3.54166667  1.67500000  1.67500000
##   [61]  2.93333333  4.44166667  3.05000000  1.33333333  2.06666667  3.25833333
##   [67]  1.75833333  3.84166667  3.29166667  6.08333333  1.18333333  3.01666667
##   [73]  2.39166667  3.65833333  2.55833333  0.34166667  0.64166667  4.47500000
##   [79]  0.85833333  4.06666667  2.87500000  1.67500000  1.94166667  2.33333333
##   [85]  3.87500000  5.15833333  2.90000000  2.34166667  3.15000000  1.29166667
##   [91]  1.75833333  1.79166667  3.77500000  1.62500000  2.16666667  1.76666667
##   [97]  0.77500000  2.54166667  2.65833333  4.29166667  2.05000000  1.73333333
##  [103]  1.64166667  3.80000000  3.78333333  0.85833333  2.92500000  2.65000000
##  [109]  2.13333333  2.56666667  3.02500000  2.16666667  2.41666667  0.22500000
##  [115]  2.88333333  2.32500000  0.61666667  1.54166667  2.59166667  0.24166667
##  [121]  2.10833333  1.12500000  3.28333333  3.15833333  1.90000000  3.86666667
##  [127]  1.96666667  2.14166667  2.96666667  2.04166667  1.71666667  2.45000000
##  [133]  2.24166667  2.82500000  2.12500000  3.18333333  4.08333333  1.69166667
##  [139]  2.71666667  1.55000000  3.91666667  3.22500000  1.89166667  3.10000000
##  [145]  2.30833333  2.67500000  2.20000000  2.03333333  2.93333333  3.00000000
##  [151]  2.59166667  2.94166667  1.03333333  0.91666667  2.57500000  2.75833333
##  [157]  0.93333333  4.00000000  1.35833333  2.20833333  0.07500000  2.84166667
##  [163]  4.78333333  1.48333333  3.65833333  2.98333333  1.95833333  2.38333333
##  [169]  2.31666667  3.17500000  1.23333333  3.90833333  4.21666667  2.07500000
##  [175]  4.10833333  2.65833333  3.67500000  2.50000000  2.26666667  2.63333333
##  [181]  2.43333333  2.65000000  2.60833333  3.56666667  3.54166667  0.36666667
##  [187]  4.27500000  3.91666667  2.19166667  1.62500000  2.97500000  1.30833333
##  [193]  3.41666667  2.04166667  2.03333333  3.84166667  4.16666667  2.23333333
##  [199]  2.40833333  4.95000000  3.50833333  4.05000000  4.12500000  2.76666667
##  [205]  3.64166667  3.40833333  1.25833333  2.15000000  1.38333333  5.69166667
##  [211]  1.63333333  3.37500000  0.01666667  2.44166667  3.20833333  1.19166667
##  [217]  3.54166667  4.90000000  1.45000000  3.55000000  3.58333333  0.61666667
##  [223]  1.50833333  3.80833333  4.55833333  3.56666667  1.41666667  0.20833333
##  [229]  2.47500000  3.00000000 -0.12500000  4.04166667  2.21666667  2.25000000
##  [235]  3.34166667  3.28333333  3.26666667  4.86666667  2.81666667  4.79166667
##  [241]  3.07500000  3.37500000  2.05833333  0.69166667  4.61666667  2.49166667
##  [247]  2.99166667  2.61666667  2.50833333  1.86666667  3.28333333  2.70833333
##  [253]  3.65833333  3.15000000  2.63333333  3.43333333  1.77500000  3.93333333
##  [259]  1.95833333  4.55000000  3.34166667  3.55000000  3.25000000  2.84166667
##  [265]  2.92500000  1.67500000  2.74166667  4.34166667  4.18333333  4.20833333
##  [271]  3.30833333  3.80833333  5.23333333  0.60833333  3.82500000  0.86666667
##  [277]  2.66666667  5.10000000  4.15833333  1.69166667  1.37500000  3.85000000
##  [283]  4.29166667  3.48333333 -0.65000000  0.22500000  4.25833333  2.15000000
##  [289]  3.50000000  2.70833333  0.27500000  0.61666667  2.40000000  1.64166667
##  [295]  1.21666667  4.58333333  3.03333333  3.62500000  1.61666667  3.00833333
##  [301]  4.40833333  3.41666667  2.27500000  1.97500000  2.71666667  1.70833333
##  [307]  1.96666667  3.05000000 -0.25000000  2.65833333  3.31666667  4.12500000
##  [313]  3.30000000  3.90833333  1.51666667  2.40000000  1.22500000  2.85000000
##  [319]  3.38333333  3.10000000  2.65833333  2.63333333  1.52500000  3.36666667
##  [325]  2.53333333  1.84166667  3.12500000  0.67500000 -0.90833333  1.40833333
##  [331]  3.35000000  5.58333333  3.05000000  2.00833333  0.97500000  1.49166667
##  [337] -0.47500000  2.35000000  3.89166667  3.69166667  3.20833333  3.85000000
##  [343]  5.30000000  2.30833333  5.37500000  4.30000000  0.57500000  1.67500000
##  [349]  2.25833333  5.06666667  3.30000000 -0.85000000  2.00833333  4.43333333
##  [355]  2.55833333  3.45833333  4.34166667  2.35833333  2.60833333  1.59166667
##  [361]  2.55000000  0.44166667  4.05000000  1.53333333  2.53333333  2.90833333
##  [367]  0.12500000  2.46666667  2.52500000  3.09166667  1.35833333  2.09166667
##  [373]  1.68333333  3.40000000  3.79166667  3.95833333  2.60000000  2.70000000
##  [379]  4.67500000  2.94166667  3.00833333  1.39166667  2.32500000  0.94166667
##  [385]  2.29166667  0.52500000  2.99166667  1.29166667  2.10000000  2.66666667
##  [391]  2.75000000  1.93333333  4.59166667  3.38333333  4.09166667  3.37500000
##  [397]  1.99166667  3.46666667  3.50833333  5.14166667  3.26666667  2.88333333
##  [403]  0.76666667  2.50000000  3.63333333  2.95000000  2.48333333  3.17500000
##  [409]  1.51666667  3.01666667  1.26666667  2.46666667  3.22500000  1.15000000
##  [415]  2.65000000  3.34166667  1.51666667  2.41666667  4.85833333  2.99166667
##  [421]  3.53333333  1.25833333 -0.84166667  3.47500000  4.37500000  1.99166667
##  [427]  4.25000000  2.19166667  3.95833333  4.28333333  3.90000000  2.68333333
##  [433]  2.45833333  3.42500000  3.22500000  3.25833333  3.91666667  0.41666667
##  [439]  2.90833333  2.69166667  2.92500000  3.27500000  2.25833333  1.00833333
##  [445]  3.00000000  1.80833333  2.10000000  3.69166667  2.36666667  2.11666667
##  [451]  2.56666667  3.63333333  2.21666667  2.70833333  2.60833333  4.25833333
##  [457]  3.55000000  1.90833333 -0.55000000  2.74166667  2.22500000  1.67500000
##  [463]  1.05000000  0.69166667  1.97500000  2.94166667  5.25833333  1.12500000
##  [469]  1.63333333  1.97500000  3.52500000  2.49166667  4.01666667  1.41666667
##  [475]  3.60000000  3.68333333  3.75833333  2.92500000  1.51666667  2.88333333
##  [481]  3.10833333  2.66666667  3.35000000  2.38333333  3.57500000  3.01666667
##  [487]  2.81666667  4.58333333  2.35833333  2.56666667  4.08333333  1.29166667
##  [493]  2.09166667  0.50000000  1.56666667  2.15000000  3.26666667  0.96666667
##  [499]  2.02500000  4.88333333  3.94166667  2.92500000  2.94166667  3.00833333
##  [505]  0.85833333  2.54166667  2.10833333  2.85833333  0.70833333  3.40833333
##  [511]  1.87500000  2.15000000  1.10000000  3.08333333  1.20000000  3.17500000
##  [517]  2.18333333  3.40000000  4.69166667  3.90833333  1.42500000  3.49166667
##  [523]  4.09166667  2.79166667  2.11666667  3.32500000  4.50000000  2.66666667
##  [529]  3.25000000  4.15000000  3.33333333  0.89166667  2.32500000  1.79166667
##  [535]  3.26666667  1.16666667  1.86666667  1.00000000  4.55000000  3.02500000
##  [541]  2.62500000  2.48333333  3.44166667  3.55833333  0.91666667  3.63333333
##  [547]  2.55833333  1.10833333  1.34166667  1.82500000  2.45000000  2.98333333
##  [553]  1.35000000  4.59166667  1.90833333  3.72500000  3.65833333  1.50000000
##  [559]  2.19166667  2.70000000  3.58333333  3.17500000  1.55833333  0.54166667
##  [565]  3.08333333  2.03333333  0.88333333  2.52500000  2.19166667  3.15000000
##  [571]  3.20833333  2.84166667  3.15833333  3.59166667  1.25833333  4.67500000
##  [577]  2.14166667  3.56666667  0.00000000  1.89166667  3.52500000  2.68333333
##  [583]  2.38333333  3.22500000  2.37500000  1.44166667  2.20833333  4.70833333
##  [589]  2.70833333  4.06666667  4.26666667  3.65833333  1.03333333  2.77500000
##  [595]  4.81666667  2.10833333  0.52500000  3.55833333  1.12500000  1.20000000
##  [601]  3.45833333  4.20833333  4.34166667  2.04166667 -1.26666667  2.60000000
##  [607]  2.59166667  2.42500000  0.75000000  4.22500000  4.20000000  4.00000000
##  [613]  4.33333333  2.53333333  3.58333333  3.93333333  3.60000000  3.17500000
##  [619]  4.30000000  4.45000000  2.95833333  3.42500000  4.23333333  3.53333333
##  [625]  1.77500000  1.10833333  4.49166667  2.44166667  0.25833333  3.30000000
##  [631]  1.74166667  2.03333333  1.49166667  2.36666667  3.25000000  1.79166667
##  [637]  0.10000000  4.47500000  3.29166667  3.50000000  2.07500000  0.61666667
##  [643]  3.77500000  5.78333333  3.40833333  1.35000000  3.32500000  3.08333333
##  [649]  2.10000000  2.27500000  0.65833333  2.29166667  1.84166667  2.51666667
##  [655]  4.40833333  3.86666667  3.22500000  0.32500000  2.17500000  4.22500000
##  [661]  2.74166667  2.69166667  0.13333333  0.12500000  3.56666667  1.45000000
##  [667]  2.26666667  2.93333333  0.90000000  2.33333333  4.63333333  2.66666667
##  [673]  4.13333333  3.27500000  2.57500000  3.19166667  2.96666667  4.02500000
##  [679]  1.94166667  2.60000000  0.63333333  2.00833333  3.68333333  2.49166667
##  [685]  3.82500000  0.88333333  1.97500000  2.42500000  0.47500000  0.53333333
##  [691] -0.08333333  0.13333333  4.39166667  2.42500000 -1.14166667  1.49166667
##  [697]  3.35000000  1.89166667  3.59166667 -0.43333333  3.47500000  3.80000000
##  [703]  0.99166667 -0.22500000  2.13333333  2.25000000  1.20833333  3.01666667
##  [709]  3.32500000  2.74166667  3.06666667  2.40000000  2.31666667  4.16666667
##  [715]  2.68333333  1.60000000  2.82500000  1.97500000  3.42500000  3.00000000
##  [721]  1.23333333  2.12500000  1.20000000  2.21666667  3.90000000  1.87500000
##  [727]  0.67500000  1.81666667  3.66666667  0.95833333  4.50000000  3.23333333
##  [733]  3.20000000  1.30833333  4.86666667 -0.34166667  4.55833333  1.78333333
##  [739]  1.77500000  1.38333333  3.90833333  2.20833333  3.71666667  3.01666667
##  [745]  3.02500000  1.40000000  3.41666667  2.14166667  4.77500000  2.48333333
##  [751]  4.45000000  0.80833333  1.97500000  3.12500000  3.25000000  4.17500000
##  [757]  3.14166667  1.31666667  2.00833333  1.92500000  3.19166667  0.35833333
##  [763]  3.58333333  3.41666667  2.93333333  2.79166667  1.18333333  3.39166667
##  [769]  0.55000000  2.75833333  0.52500000  1.05000000  2.94166667  2.45000000
##  [775]  1.75833333  3.30000000  2.44166667  3.08333333  1.19166667  2.81666667
##  [781]  3.93333333  3.05833333  3.60833333  4.02500000  1.44166667  3.24166667
##  [787]  1.30000000  2.85000000  3.67500000  2.78333333  2.43333333  4.15833333
##  [793]  4.05000000  2.50000000  1.61666667  3.33333333  2.10000000  3.09166667
##  [799]  2.08333333  3.43333333  3.90833333  1.08333333  3.13333333  2.65000000
##  [805]  2.01666667  1.95000000  2.37500000  2.84166667  4.04166667  1.87500000
##  [811]  1.34166667  2.30833333  3.70833333  3.65000000  0.40833333  2.77500000
##  [817]  2.46666667  4.18333333  4.23333333  2.10833333  2.36666667  2.50000000
##  [823]  4.20000000  4.35833333  2.56666667  1.65000000  5.14166667  1.50833333
##  [829]  3.07500000  1.78333333  3.35833333  3.90000000  3.32500000  4.84166667
##  [835]  1.13333333  4.21666667  2.15833333  0.88333333  4.14166667  1.30000000
##  [841]  4.26666667  1.45000000  2.85833333  4.74166667  2.32500000  2.17500000
##  [847]  0.96666667  2.26666667  0.82500000  3.65833333  2.43333333  3.16666667
##  [853]  3.42500000  0.75000000  1.23333333  3.84166667  2.15833333  3.18333333
##  [859]  2.95833333  4.20833333  3.40000000  1.93333333  3.42500000  2.59166667
##  [865]  1.72500000  2.55833333  4.05000000  1.30833333  3.15000000  0.90000000
##  [871]  3.23333333  2.85000000  2.37500000  2.65000000  3.27500000  0.26666667
##  [877]  1.58333333  2.70833333  4.80833333  0.43333333  2.82500000  2.31666667
##  [883]  3.00000000  2.45833333  1.27500000  3.09166667  2.14166667  1.66666667
##  [889]  0.90000000  3.46666667  3.25833333  3.70000000  1.13333333  5.33333333
##  [895]  3.02500000  2.50833333  2.34166667  0.80833333  1.72500000  3.57500000
##  [901]  3.54166667  3.84166667  2.02500000 -0.55000000  2.35000000  0.91666667
##  [907]  3.20000000  3.25000000  3.24166667  4.74166667  3.18333333  2.32500000
##  [913]  2.25000000  3.21666667  2.45833333  3.54166667  2.74166667  3.07500000
##  [919]  1.81666667  1.03333333  3.21666667 -0.52500000  1.75833333  1.61666667
##  [925]  2.15833333  5.04166667  2.04166667  2.92500000  2.85000000  3.14166667
##  [931]  3.21666667  2.18333333  2.37500000  2.94166667  3.62500000  5.34166667
##  [937]  3.23333333  3.82500000  2.50833333  5.04166667  1.78333333 -0.11666667
##  [943]  3.26666667  3.46666667  2.75000000  2.83333333  2.39166667  1.66666667
##  [949]  3.23333333  3.32500000  1.54166667  0.64166667  1.30833333  1.50000000
##  [955]  0.83333333  4.55000000  1.75833333  2.72500000  3.76666667  2.64166667
##  [961]  1.17500000  3.07500000  1.50833333  3.30000000  0.59166667  2.29166667
##  [967]  2.55000000  4.30833333  3.80000000  3.93333333  2.84166667  1.87500000
##  [973]  3.39166667  3.21666667  4.09166667  3.98333333  0.53333333  3.15833333
##  [979]  3.41666667  4.19166667  2.39166667  1.50833333  2.68333333  1.90000000
##  [985]  1.15000000  3.95833333  0.71666667  1.94166667  2.49166667  1.59166667
##  [991]  4.83333333  3.15000000  2.30833333  2.49166667  4.42500000  2.40000000
##  [997]  2.55000000  4.45000000  2.66666667  0.92500000
## 
## $func.thetastar
## NULL
## 
## $jack.boot.val
## NULL
## 
## $jack.boot.se
## NULL
## 
## $call
## bootstrap(x = ZeaMays$diff, nboot = B, theta = mean)
bootstrap(ZeaMays$diff, B, median)
## $thetastar
##    [1] 2.875 2.875 3.000 5.125 3.000 3.500 3.500 2.875 3.500 2.875 2.000 2.000
##   [13] 3.000 3.000 3.500 2.000 2.875 3.000 1.000 3.500 3.000 3.000 3.000 2.875
##   [25] 2.000 3.625 3.000 2.875 1.750 5.125 3.500 5.125 3.625 2.875 3.625 3.625
##   [37] 2.875 2.000 3.500 6.125 3.500 3.000 3.000 3.625 3.625 2.875 3.500 2.000
##   [49] 3.000 1.750 2.875 3.625 6.125 2.875 3.500 1.000 5.125 2.000 1.750 5.125
##   [61] 3.000 2.875 2.875 3.000 5.125 2.000 3.625 3.000 3.500 3.000 3.000 3.000
##   [73] 3.000 3.000 2.875 2.875 3.000 2.875 2.875 2.875 3.625 3.500 3.000 2.000
##   [85] 2.875 3.500 2.875 3.000 1.750 2.875 3.500 2.000 3.000 2.000 1.000 2.875
##   [97] 6.125 6.125 2.875 1.750 6.125 3.500 2.875 3.000 2.875 6.125 3.625 2.875
##  [109] 2.000 3.000 3.500 2.875 2.000 2.875 2.875 3.625 1.000 2.875 3.625 2.875
##  [121] 1.750 2.000 3.500 3.500 1.750 3.625 7.000 3.500 3.000 2.875 2.000 2.875
##  [133] 2.000 2.875 2.875 3.000 2.000 1.750 2.000 1.750 2.000 1.750 3.500 3.625
##  [145] 2.875 1.750 2.875 3.500 3.000 3.625 3.500 3.500 1.750 5.125 2.875 2.875
##  [157] 3.500 1.750 3.000 2.000 2.000 2.875 3.500 3.000 3.500 3.625 3.000 2.875
##  [169] 5.125 1.750 2.000 3.000 3.000 3.500 3.000 5.125 6.125 3.000 3.625 2.875
##  [181] 2.875 3.000 2.875 3.500 3.500 3.000 3.500 2.875 1.750 5.125 3.500 2.000
##  [193] 2.000 3.000 3.625 2.875 2.875 3.000 3.000 3.000 3.625 2.875 2.000 3.500
##  [205] 2.000 3.000 3.500 2.000 6.125 2.875 2.875 3.625 3.000 2.875 3.000 2.000
##  [217] 3.500 3.625 2.000 3.625 3.625 3.000 3.500 3.500 0.750 3.000 5.125 5.125
##  [229] 3.000 3.625 2.000 1.750 6.125 3.500 2.875 3.000 1.750 3.625 2.875 2.875
##  [241] 2.000 3.500 3.000 2.875 1.000 2.875 3.625 2.000 2.875 3.000 2.875 3.500
##  [253] 2.875 3.500 3.000 1.750 5.125 3.625 2.000 3.500 2.875 3.000 2.875 3.000
##  [265] 5.125 2.875 1.000 5.125 3.000 3.000 2.875 3.000 3.000 2.875 3.500 2.875
##  [277] 3.500 3.500 3.000 1.000 2.875 2.000 3.500 1.750 5.125 3.500 2.875 3.625
##  [289] 0.750 1.750 6.125 3.000 3.000 3.625 2.875 3.000 2.000 3.000 3.000 3.000
##  [301] 2.875 3.000 3.625 3.500 2.000 2.000 3.000 1.750 2.000 3.000 3.500 3.500
##  [313] 3.625 3.500 6.125 3.000 5.125 5.125 3.500 2.875 3.500 3.625 2.875 3.000
##  [325] 2.000 7.000 2.875 3.000 2.000 3.625 3.625 2.000 2.000 3.500 2.875 2.000
##  [337] 2.875 2.000 2.875 3.000 2.000 3.000 3.000 2.875 3.500 2.875 3.000 2.875
##  [349] 2.000 5.125 3.625 1.750 3.000 1.000 2.875 1.000 2.000 3.500 3.500 3.000
##  [361] 1.000 3.500 2.000 2.875 3.625 1.750 3.625 2.875 3.000 3.500 3.500 3.500
##  [373] 3.500 3.000 3.000 2.000 3.500 3.000 6.125 5.125 2.875 3.000 3.625 3.625
##  [385] 3.500 3.000 1.750 3.000 3.625 3.000 2.000 3.000 2.875 5.125 1.750 2.875
##  [397] 3.500 3.500 3.000 2.000 3.000 2.000 2.875 2.875 3.500 2.000 2.875 2.875
##  [409] 3.000 3.500 2.875 2.000 3.000 3.500 3.000 3.500 2.875 5.125 2.875 2.875
##  [421] 5.125 3.625 3.500 1.000 5.125 3.625 3.000 1.750 2.000 3.625 2.000 2.000
##  [433] 3.625 2.875 3.000 3.000 3.000 3.625 1.750 3.000 3.625 3.000 2.875 3.500
##  [445] 3.625 5.125 3.000 1.000 2.875 3.625 3.000 5.125 1.000 6.125 3.000 3.500
##  [457] 1.750 3.500 1.000 1.750 1.000 6.125 3.500 2.875 5.125 6.125 2.000 3.500
##  [469] 5.125 2.000 3.500 3.625 2.875 5.125 2.875 1.750 2.875 2.875 3.625 2.000
##  [481] 3.500 1.000 3.000 1.000 1.750 6.125 3.500 3.625 3.500 2.875 3.500 2.000
##  [493] 2.000 2.875 2.000 2.875 1.750 2.875 3.625 3.500 3.000 3.000 5.125 3.625
##  [505] 3.625 3.000 3.625 1.750 2.000 3.000 3.500 2.000 3.000 2.875 2.875 2.000
##  [517] 3.000 3.625 2.000 3.500 2.000 2.000 3.500 2.875 2.000 3.625 3.625 5.125
##  [529] 3.000 6.125 2.875 3.625 3.000 2.000 5.125 3.500 3.000 3.000 2.875 1.000
##  [541] 3.625 3.500 3.500 3.500 2.875 2.875 3.000 3.500 3.500 3.500 5.125 2.000
##  [553] 3.625 2.875 3.500 3.500 3.500 3.500 3.000 2.000 2.000 1.000 3.500 3.625
##  [565] 2.875 2.875 5.125 5.125 3.000 1.000 3.000 1.000 3.000 2.875 2.875 2.875
##  [577] 6.125 3.500 3.500 3.000 3.500 3.000 3.500 1.750 3.625 2.000 2.000 3.000
##  [589] 3.625 2.000 2.875 3.500 3.500 3.625 3.625 3.500 2.875 3.000 3.000 6.125
##  [601] 3.000 2.875 1.750 5.125 2.875 3.625 3.625 3.000 1.000 3.500 3.000 3.500
##  [613] 2.875 3.625 3.500 2.000 2.875 1.750 5.125 3.625 6.125 3.000 1.750 2.875
##  [625] 3.500 1.750 3.500 3.500 3.000 3.500 3.625 3.500 3.000 3.500 2.875 1.750
##  [637] 1.000 2.875 1.000 3.625 5.125 5.125 2.000 3.500 3.625 1.750 2.000 3.500
##  [649] 3.500 2.000 2.875 2.875 2.875 2.875 3.500 3.500 3.000 2.875 2.875 3.500
##  [661] 1.750 2.875 5.125 3.625 6.125 2.875 3.500 2.875 3.000 2.875 2.875 3.625
##  [673] 3.000 5.125 3.500 5.125 2.000 2.875 3.000 2.000 2.000 3.000 1.750 2.875
##  [685] 3.000 3.625 3.625 3.000 3.500 3.500 2.875 3.000 1.750 3.000 3.625 2.875
##  [697] 2.875 2.875 3.000 3.000 3.000 3.625 3.000 5.125 2.875 3.500 2.875 6.125
##  [709] 1.750 2.000 2.875 5.125 3.500 2.000 2.875 3.000 5.125 3.000 2.875 3.500
##  [721] 3.625 3.500 3.500 2.000 2.875 3.000 3.000 3.625 2.875 5.125 3.000 3.625
##  [733] 2.000 2.875 7.000 3.625 2.000 2.000 2.875 3.000 3.625 2.000 2.000 6.125
##  [745] 2.000 1.750 2.875 3.500 2.000 6.125 2.000 3.000 2.875 5.125 2.000 2.000
##  [757] 3.000 1.750 3.500 2.000 3.000 1.750 2.000 5.125 3.000 1.750 5.125 3.500
##  [769] 3.000 5.125 3.625 5.125 2.875 2.875 0.750 2.000 3.000 3.625 2.000 2.875
##  [781] 3.000 3.500 2.000 3.625 1.750 3.625 3.000 6.125 3.000 3.500 3.000 1.750
##  [793] 5.125 2.875 3.500 3.500 2.875 3.500 1.750 2.000 3.625 3.000 3.000 3.625
##  [805] 2.875 2.000 3.000 6.125 2.875 3.000 3.625 2.875 1.750 2.000 2.875 3.625
##  [817] 3.625 3.500 3.500 1.000 2.875 2.875 3.000 3.000 1.750 2.875 3.000 2.000
##  [829] 3.500 3.625 5.125 1.750 3.500 2.875 3.625 1.000 3.000 1.750 2.875 3.625
##  [841] 2.875 2.875 3.500 5.125 5.125 3.000 2.875 2.875 3.500 2.000 2.000 3.625
##  [853] 2.875 2.000 2.000 3.000 3.000 3.500 3.625 2.000 3.000 2.000 3.000 2.875
##  [865] 1.000 5.125 2.875 2.000 3.000 3.500 3.000 2.875 3.000 5.125 3.500 1.000
##  [877] 3.500 1.750 3.000 2.875 5.125 3.625 2.875 3.500 3.000 5.125 3.500 2.875
##  [889] 2.000 3.625 2.875 3.500 3.500 2.875 3.000 3.000 2.000 3.500 2.875 3.000
##  [901] 3.000 3.000 2.875 2.875 2.875 3.625 3.000 1.000 3.000 2.875 2.875 3.625
##  [913] 1.750 3.625 2.875 2.875 1.750 3.500 3.000 3.000 2.000 3.000 3.000 3.000
##  [925] 3.000 6.125 3.000 2.000 3.500 3.000 2.000 5.125 6.125 3.000 6.125 3.000
##  [937] 3.500 3.625 3.500 3.625 3.625 3.500 3.625 1.750 2.000 3.000 2.875 5.125
##  [949] 2.000 2.875 2.000 3.500 2.000 3.500 1.750 2.000 3.500 3.000 3.000 2.000
##  [961] 2.000 3.000 1.000 3.000 3.500 2.875 1.750 2.875 2.875 1.000 2.875 3.625
##  [973] 3.500 3.000 5.125 5.125 3.625 2.875 2.875 2.000 3.500 2.875 3.500 2.875
##  [985] 3.500 3.625 3.000 5.125 3.500 5.125 2.000 3.000 3.500 3.500 2.875 3.000
##  [997] 2.000 2.875 2.000 2.875
## 
## $func.thetastar
## NULL
## 
## $jack.boot.val
## NULL
## 
## $jack.boot.se
## NULL
## 
## $call
## bootstrap(x = ZeaMays$diff, nboot = B, theta = median)

Laplace distribution

Probability density function :

\[f(x\mid \mu ,b)={\frac {1}{2b}}\exp \left(-{\frac {|x-\mu |}{b}}\right)\,\!\]

Laplace is a function of normal and exponential random variables:

\[X=\sqrt{W}\cdot Z, \quad W\sim Exp(1), \quad Z\sim N(0,1)\] The Laplace distribution with mean \(\mu \in \mathbb{R}\) and scale \(b > 0\) has the probability density function \[f_X(x)=\frac{1}{2b}\exp\left(-\frac{\left|x\right|}{b}\right)\] Note that if we can show that the Laplace distribution with mean \(0\) is a mixture of normals, then by shifting all these normals by \(\mu\), it follows that the Laplace distribution with mean \(\mu\) is also a mixture of normals.

Fix \(b > 0\). Let \(W \sim \text{Exp}(\frac{1}{2b^2})\), i.e. \[f_W(w) = \dfrac{1}{2b^2} \exp \left(-\dfrac{x}{2b^2} \right)\] for \(w \geq 0\), and let \(X \mid W = w \sim \mathcal{N}(0, w)\). We claim that \(X\) has the Laplace distribution with mean \(0\) and scale \(b\). This is equivalent to showing that for any \(x\), \[\begin{align} f_X(x)=\frac{1}{2b}\exp\left(-\frac{\left|x\right|}{b}\right) \iff \int_{0}^{\infty} f_{X\mid W=w}(x)f_{W}(w)dw = \frac{1}{2b}\exp\left(-\frac{\left|x\right|}{b}\right) \end{align}\] \[\begin{align} \int_{0}^{\infty} f_{X\mid W=w}(x)f_{W}(w)dw &= \int_{0}^{\infty} \frac{1}{\sqrt{2\pi w}}\exp\left(-\frac{x^2}{2w}\right)\frac{1}{2b^2}\exp\left(-\frac{w}{2b^2}\right)dw\\ &= \frac{1}{2b^2}\int_{0}^{\infty}\frac{1}{\sqrt{2\pi w}}\exp\left(-\frac{w^2+x^2b^2}{2b^2w}\right)dw\\ &= \frac{1}{2b^2}\int_{0}^{\infty}\frac{1}{\sqrt{2\pi w}}\exp\left(-\frac{(w-|x|b)^2}{2b^2w}-\frac{2w|x|b}{2b^2w}\right)dw\\ &= \frac{1}{2b^2}e^{-|x|/b}\int_{0}^{\infty}\frac{b}{\sqrt{2\pi b^2 w}}\exp\left(-\frac{(w-|x|b)^2}{2b^2w}\right)dw\\ &=\frac{1}{2b}\exp\left(-\frac{\left|x\right|}{b}\right) \end{align}\]

The Mean of \[\frac{1}{2b}\exp\left(-\frac{\left|x-\mu\right|}{b}\right)\] is \(\mu\) and Variance is \(2b^2\).

The Laplace distribution is an example of where the consideration of the generative process indicates how the variance and mean are linked. The expectation value and variance of an asymmetric Laplace distribution \(AL(\theta, \mu, \sigma)\) are \[\mathbb E(X)=\theta+ \mu\] and \[\text{var} (X)=\mu^2 +\sigma^2\] Note the variance is dependent on the mean, unless (the case of the symmetric Laplace Distribution). This is the feature of the distribution that makes it useful. Having a mean-variance dependence is very common for physical measurements, be they microarray fluorescence intensities, peak heights from a mass spectrometer, or reads counts from high-throughput sequencing, as we’ll see in the next section.

## -----------------------------------------------------------------------------
c(N3 = choose(5, 3), N15 = choose(29, 15))
##       N3      N15 
##       10 77558760
## -----------------------------------------------------------------------------
w = rexp(10000, rate = 1)

## -----------------------------------------------------------------------------
mu  = 0.3
lps = rnorm(length(w), mean = mu, sd = sqrt(w))
ggplot(data.frame(lps), aes(x = lps)) +
  geom_histogram(fill = "purple", binwidth = 0.1)

## -----------------------------------------------------------------------------
mu = 0.3; sigma = 0.4; theta = -1
w  = rexp(10000, 1)
alps = rnorm(length(w), theta + mu * w, sigma * sqrt(w))
ggplot(tibble(alps), aes(x = alps)) +
  geom_histogram(fill = "purple", binwidth = 0.1)

## -----------------------------------------------------------------------------
ggplot(tibble(x = rgamma(10000, shape = 2, rate = 1/3)),
   aes(x = x)) + geom_histogram(bins = 100, fill= "purple")

ggplot(tibble(x = rgamma(10000, shape = 10, rate = 3/2)),
   aes(x = x)) + geom_histogram(bins = 100, fill= "purple")

## -----------------------------------------------------------------------------
lambda = rgamma(10000, shape = 10, rate = 3/2)
gp = rpois(length(lambda), lambda = lambda)
ggplot(tibble(x = gp), aes(x = x)) +
  geom_histogram(bins = 100, fill= "purple")

## -----------------------------------------------------------------------------
library("vcd")
ofit = goodfit(gp, "nbinomial")
plot(ofit, xlab = "")

ofit$par
## $size
## [1] 10.23463
## 
## $prob
## [1] 0.6037692
## -----------------------------------------------------------------------------
x    = 0:95
mu   = 50
vtot = 80
v1   = vtot - mu
scale = v1/mu    # 0.6
shape = mu^2/v1  # 83.3
p1   = dgamma(x = x, scale = 0.6, shape = 80)
p2   = dpois(x = x, lambda = mu*1.2)
p3   = dnbinom(x = x, mu = mu, size = mu^2/vtot)
## -----------------------------------------------------------------------------
library("RColorBrewer")
cols = brewer.pal(8, "Paired")
par(mfrow=c(3,1), mai=c(0.5, 0.5, 0.01, 0.01))
xlim = x[c(1, length(x))]
plot(NA, NA, xlim=xlim, ylim=c(0,0.07), type="n", ylab="", xlab="")
polygon(x, p1, col=cols[1])
abline(v=mu, col="black", lwd=3)
abline(v=mu*1.2, col=cols[2], lty=2, lwd=3)
plot(x, p2, col=cols[3], xlim=xlim, ylab="", xlab="", type="h", lwd=2)
abline(v=mu*1.2, col=cols[2], lwd=2)
abline(v=mu*1.1, col=cols[4], lty=2, lwd=3)
plot(x, p3, col=cols[4], xlim=xlim, type="h", lwd=2, ylab="", xlab="")

## -----------------------------------------------------------------------------
lambdas = seq(100, 900, by = 100)
simdat = lapply(lambdas, function(l)
    tibble(y = rpois(n = 40, lambda=l), lambda = l)
  ) %>% bind_rows
simdat
## # A tibble: 360 × 2
##        y lambda
##    <int>  <dbl>
##  1   105    100
##  2   102    100
##  3    97    100
##  4    94    100
##  5   100    100
##  6   100    100
##  7   102    100
##  8    97    100
##  9    90    100
## 10   107    100
## # ℹ 350 more rows
library("ggbeeswarm")
ggplot(simdat, aes(x = lambda, y = y)) +
  geom_beeswarm(alpha = 0.6, color = "purple")

ggplot(simdat, aes(x = lambda, y = sqrt(y))) +
  geom_beeswarm(alpha = 0.6, color = "purple")

Expected Value of Square Root of Poisson Random Variable

In general, for smooth \(g(X)\) you can do a Taylor expansion around the mean \(\mu=E(X)\) :

\[g(X)=g(\mu)+g′(\mu)(X−\mu)+\frac{g′′(\mu)}{2!}(X−\mu)^2+\frac{g′′′(\mu)}{3!}(X−\mu)^3+⋯\] So

\[E[g(X)]=g(\mu)+\frac{g′′(\mu)}{2!}m_2+\frac{g′′′(\mu)}{3!}m_3+⋯\] where \(m_i\) is the \(i^{th}\) centered moment. In our case \(m_2=m_3=\lambda\), so:

\[E[g(X)]=\sqrt{\lambda}−\frac{\lambda^{−1/2}}{8}+\frac{\lambda^{−3/2}}{16}+⋯\] This approximation is useful only if \(\lambda \gg 1\)

The mean and variance of any Poisson process is given as

\[E[P(\lambda_i)] = Var[P(\lambda_i)] = \lambda_i\]

Square root transformation of Poisson process

\[\operatorname{var}(\sqrt{P(\lambda)})=E[P(\lambda)]-E[\sqrt{P(\lambda)}]^2=\lambda-E[\sqrt{P(\lambda)}]^2\]

The second term can be approximated better as follows:

\[E[\sqrt{P(\lambda)}]\approx \sqrt{\lambda}-\frac{\lambda^{-1/2}}{8}+\frac{\lambda^{-3/2}}{16}+...\]

Which is why the expected value is also approximated by \(\sqrt \lambda\). Square of it will be \[E[\sqrt{P(\lambda)}]^2\approx \lambda -\frac{1}{4}+\frac{9}{64\lambda}+...\]

So, the variance will be approximately \[\operatorname{var}(\sqrt{P(\lambda)})\approx \frac{1}{4}-\frac{9}{64\lambda}+...\]

which is approximately \(1/4\) for large \(\lambda\), and the standard deviation is approximately \(1/2\).

## -----------------------------------------------------------------------------
.o = options(digits = 3)
summarise(group_by(simdat, lambda), sd(y), sd(2*sqrt(y)))
## # A tibble: 9 × 3
##   lambda `sd(y)` `sd(2 * sqrt(y))`
##    <dbl>   <dbl>             <dbl>
## 1    100    10.3             1.03 
## 2    200    14.7             1.05 
## 3    300    17.9             1.04 
## 4    400    19.2             0.959
## 5    500    23.1             1.03 
## 6    600    25.5             1.05 
## 7    700    26.8             1.02 
## 8    800    26.7             0.945
## 9    900    37.0             1.24
options(.o)
## -----------------------------------------------------------------------------
muvalues = 2^seq(0, 10, by = 1)
simgp = lapply(muvalues, function(mu) {
  u = rnbinom(n = 1e4, mu = mu, size = 4) # size:   target for number of successful trials, or dispersion parameter (the shape parameter of the gamma mixing distribution). Must be strictly positive, need not be integer.
  tibble(mean = mean(u), sd = sd(u),
         lower = quantile(u, 0.025),
         upper = quantile(u, 0.975),
         mu = mu)
  } ) %>% bind_rows
simgp
## # A tibble: 11 × 5
##       mean     sd lower upper    mu
##      <dbl>  <dbl> <dbl> <dbl> <dbl>
##  1    1.00   1.11     0    4      1
##  2    1.98   1.72     0    6      2
##  3    4.03   2.86     0   11      4
##  4    8.03   4.93     1   20      8
##  5   16.1    8.96     3   38     16
##  6   32.0   16.7      8   71     32
##  7   64.1   33.2     16  143     64
##  8  127.    64.9     32  282    128
##  9  257.   129.      69  556    256
## 10  507.   252.     133 1094.   512
## 11 1020.   508.     272 2239.  1024
head(as.data.frame(simgp), 2)
##     mean       sd lower upper mu
## 1 1.0019 1.110234     0     4  1
## 2 1.9824 1.723133     0     6  2
ggplot(simgp, aes(x = mu, y = mean, ymin = lower, ymax = upper)) +
  geom_point() + geom_errorbar()

## -----------------------------------------------------------------------------
simgp = mutate(simgp,
  slopes = 1 / sd,
  trsf   = cumsum(slopes * mean))
ggplot(simgp, aes(x = mean, y = trsf)) +
  geom_point() + geom_line() + xlab("")

Call our transformation function \(g\), and assume it’s differentiable. Also call our random variables \(X_i\) , with means \(\mu_i\) and variances \(v_i\) , and we assume that \(\mu_i\) and \(v_i\) are related by a functional relationship \(v_i = \nu(\mu_i)\) . Then, for values of \(X_i\) in the neighborhood of its mean \(\mu_i\), \[g(X_i)=g(\mu_i)+g'(\mu_i)(X_i-\mu_i)+\cdots\]

where the dots stand for higher order terms that we can neglect. The variances of the transformed values are then \[\text{Var }(g(X_i))\simeq g'(\mu_i)^2\] \[\text{Var }(X_i)\simeq g'(\mu_i)^2\nu(\mu_i)\]

where we have used the rules \(\text{Var}(X-c)=\text{Var}(X)\) and \(\text{Var}(cX)=c^2\text{Var}(X)\) that hold whenever \(c\) is a constant number. Requiring that this be constant leads to the differential equation \[g'(x)=\frac{1}{\sqrt{\nu(x)}}\] For a given mean-variance relationship \(\nu(\mu)\), we can solve this for the function \(g\) . Let’s check this for some simple cases:

  • if \(\nu(\mu)=\mu\)(Poisson), we recover \(g(x)=\sqrt{x}\), the square root transformation.
  • If \(\nu(\mu)=\alpha \mu^2\), solving the differential equation \[g'(x)=\frac{1}{\sqrt{\nu(x)}}\] gives \(g(x)=\log(x)\). This explains why the logarithm transformation is so popular in many data analysis applications: it acts as a variance stabilizing transformation whenever the data have a constant coefficient of variation, that is, when the standard deviation is proportional to the mean.
## -----------------------------------------------------------------------------
f = function(x, a) 
  ifelse (a==0, 
    sqrt(x), 
    log(2*sqrt(a) * sqrt(x*(a*x+1)) + 2*a*x+1) / (2*sqrt(a)))
x  = seq(0, 24, by = 0.1)
df = lapply(c(0, 0.05*2^(0:5)), function(a) 
  tibble(x = x, a = a, y = f(x, a))) %>% bind_rows()
ggplot(df, aes(x = x, y = y, col = factor(a))) + 
  geom_line() + labs(col = expression(alpha))

## -----------------------------------------------------------------------------
f2 = function(x, a) ifelse (a==0, sqrt(x), acosh(2*a*x + 1) / (2*sqrt(a)))  
with(df, max(abs(f2(x,a) - y)))
## [1] 8.881784e-16
stopifnot(with(df, max(abs(f2(x,a) - y))) < 1e-10)
## -----------------------------------------------------------------------------
a = c(0.2, 0.5, 1)
f(1e6, a) 
## [1] 15.196731 10.259171  7.600903
1/(2*sqrt(a)) * (log(1e6) + log(4*a))
## [1] 15.196728 10.259170  7.600902
## -----------------------------------------------------------------------------
setwd("D:/books/R/blogdown/hugo")
mx = readRDS("../data/Myst.rds")$yvar
str(mx)
##  num [1:1800] 0.3038 0.0596 -0.0204 0.1849 0.2842 ...
ggplot(tibble(mx), aes(x = mx)) + geom_histogram(binwidth = 0.025)

## -----------------------------------------------------------------------------
wA = runif(length(mx))
wB = 1 - wA

## -----------------------------------------------------------------------------
iter      = 0
loglik    = -Inf
delta     = +Inf
tolerance = 1e-3
miniter   = 50
maxiter   = 1000
## -----------------------------------------------------------------------------
while((delta > tolerance) && (iter <= maxiter) || (iter < miniter)) {
  lambda = mean(wA)
  muA = weighted.mean(mx, wA)
  muB = weighted.mean(mx, wB)
  sdA = sqrt(weighted.mean((mx - muA)^2, wA))
  sdB = sqrt(weighted.mean((mx - muB)^2, wB))

  pA   =    lambda    * dnorm(mx, mean = muA, sd = sdA)
  pB   = (1 - lambda) * dnorm(mx, mean = muB, sd = sdB)
  ptot = pA + pB
  wA   = pA / ptot
  wB   = pB / ptot

  loglikOld = loglik
  loglik = sum(log(pA)) + sum(log(pB))
  delta = abs(loglikOld - loglik)
  iter = iter + 1
}
iter
## [1] 315
## -----------------------------------------------------------------------------
.o = options(digits = 3)
c(lambda, muA, muB, sdA, sdB)
## [1]  0.5244  0.1473 -0.1694  0.1498  0.0983
options(.o)

## -----------------------------------------------------------------------------
.o = options(digits = 3)
gm = mixtools::normalmixEM(mx, k = 2)
## number of iterations= 276
with(gm, c(lambda[1], mu, sigma))
## [1]  0.4756 -0.1694  0.1473  0.0983  0.1498
options(.o)
## -----------------------------------------------------------------------------
library("flexmix")
## 
## Attaching package: 'flexmix'
## The following object is masked from 'package:boot':
## 
##     boot
## The following object is masked from 'package:Gviz':
## 
##     group
data("NPreg")
NPreg
##              x         yn yp yb class id1 id2
## 1   4.17663259 22.3803787  4  0     1   1   1
## 2   1.20163055  5.1115746  3  0     1   1   1
## 3   2.29500635 13.2510576  9  0     1   2   1
## 4   5.96586786 30.2852404  3  1     1   2   1
## 5   2.35808338 14.7645078  4  0     1   3   2
## 6   7.63706104 42.8337601  1  1     1   3   2
## 7   6.45246460 34.2667327  2  1     1   4   2
## 8   9.40163061 42.8138796  2  1     1   4   2
## 9   8.29927115 49.0659954  2  1     1   5   3
## 10  4.33053110 19.3005980  4  0     1   5   3
## 11  4.61978652 29.8703835  2  0     1   6   3
## 12  9.71513912 46.4716001  0  1     1   6   3
## 13  2.27872578  4.6923751  9  1     1   7   4
## 14  6.92897805 34.3449694  0  1     1   7   4
## 15  7.25327696 32.9045352  0  1     1   8   4
## 16  0.40434891  1.2872814  6  1     1   8   4
## 17  9.36491266 49.8382514  2  1     1   9   5
## 18  2.57714695 11.8085276  4  0     1   9   5
## 19  7.74066373 33.0757635  1  1     1  10   5
## 20  9.89474383 51.6069735  4  1     1  10   5
## 21  0.55744518  4.6421303 10  0     1  11   6
## 22  1.54234713  7.9628999 10  0     1  11   6
## 23  8.09633770 45.4036015  2  1     1  12   6
## 24  1.08087069  4.1806895  7  0     1  12   6
## 25  3.13088989 10.1874921  2  0     1  13   7
## 26  7.36439054 37.6754354  2  1     1  13   7
## 27  8.85874283 45.7929465  2  1     1  14   7
## 28  5.66849986 33.4717028  2  1     1  14   7
## 29  6.50576695 32.7460204  1  1     1  15   8
## 30  1.53762205  4.8097527  6  0     1  15   8
## 31  6.73142951 34.4618322  1  1     1  16   8
## 32  0.54238057  6.7040248  8  0     1  16   8
## 33  0.34155727  1.5408095  6  0     1  17   9
## 34  4.75640223 22.0913005  5  0     1  17   9
## 35  8.73919645 42.1726314  0  1     1  18   9
## 36  2.70205396 10.7961692  4  0     1  18   9
## 37  8.30704714 40.8369753  0  1     1  19  10
## 38  3.74932799 13.5857934  6  0     1  19  10
## 39  4.28106160 17.0062023  5  0     1  20  10
## 40  0.44036390  4.8016176  4  0     1  20  10
## 41  7.20383449 35.2574679  1  1     1  21  11
## 42  0.82404352  3.6779612  7  0     1  21  11
## 43  4.42757293 21.4447769  3  1     1  22  11
## 44  0.33236471  3.9858411  4  0     1  22  11
## 45  8.98413221 41.3208325  1  1     1  23  12
## 46  4.09455624 22.0601242  2  0     1  23  12
## 47  1.95838182  7.5923929  4  0     1  24  12
## 48  9.15891990 41.5530343  0  1     1  24  12
## 49  8.89296908 45.8451285  1  0     1  25  13
## 50  2.45296791  7.2185934  5  0     1  25  13
## 51  0.06295317 -0.7382478  4  0     1  26  13
## 52  2.85040085 11.5908587  3  0     1  26  13
## 53  7.96088626 36.2613458  3  1     1  27  14
## 54  5.77690206 28.5847337  3  1     1  27  14
## 55  4.13608936 25.3425513  3  0     1  28  14
## 56  0.42128309 -1.4128968  7  0     1  28  14
## 57  9.84276558 51.1781358  1  1     1  29  15
## 58  0.20367758  3.0164145  8  0     1  29  15
## 59  1.22520362 10.0824988  3  0     1  30  15
## 60  7.71576158 40.6650416  2  1     1  30  15
## 61  5.68957164 33.5874400  1  1     1  31  16
## 62  1.64931152  7.6441527  8  1     1  31  16
## 63  6.21120097 31.0314027  2  0     1  32  16
## 64  1.26452997 -1.4815158  5  0     1  32  16
## 65  6.92773212 30.1226659  2  0     1  33  17
## 66  5.03979465 24.0594062  0  1     1  33  17
## 67  7.31309950 44.6842204  0  1     1  34  17
## 68  4.63261999 27.0774981  4  0     1  34  17
## 69  0.51121986  5.3782324  4  0     1  35  18
## 70  1.07330848  5.5521353  4  0     1  35  18
## 71  7.96409829 34.2089461  2  1     1  36  18
## 72  1.99945993  5.6326370  3  0     1  36  18
## 73  1.87378620  6.1134515  7  0     1  37  19
## 74  4.74580563 19.8444540  2  0     1  37  19
## 75  5.76907872 26.7631365  3  1     1  38  19
## 76  0.52463379  4.3079712  2  0     1  38  19
## 77  3.46584146 14.5557600  6  0     1  39  20
## 78  7.29252838 39.4454486  1  1     1  39  20
## 79  5.40030459 26.7301592  3  0     1  40  20
## 80  2.79238097 14.0218112  8  0     1  40  20
## 81  3.90283673 15.9778277  2  0     1  41  21
## 82  5.04182776 26.2447923  2  0     1  41  21
## 83  6.42973634 30.8354474  1  1     1  42  21
## 84  7.48850271 35.4915284  5  1     1  42  21
## 85  8.63020112 48.2392871  2  1     1  43  22
## 86  7.15776147 39.7617173  3  1     1  43  22
## 87  8.97324030 43.3963793  0  1     1  44  22
## 88  2.60951572 13.8106373  3  0     1  44  22
## 89  2.96134369 12.6836033  5  1     1  45  23
## 90  8.83580907 46.3313585  1  1     1  45  23
## 91  6.14941012 23.8764264  4  1     1  46  23
## 92  1.95363952 10.5716207  4  0     1  46  23
## 93  4.39655041 18.5927401  3  0     1  47  24
## 94  3.13603434 15.2807182  4  0     1  47  24
## 95  5.99525632 29.7251730  1  1     1  48  24
## 96  8.79126861 53.4866896  2  1     1  48  24
## 97  9.68095973 49.4019432  2  1     1  49  25
## 98  7.40750214 37.4921016  2  1     1  49  25
## 99  4.11756789 26.1089606  2  1     1  50  25
## 100 8.77639766 43.8204884  0  1     1  50  25
## 101 8.39039786 30.0024180  9  0     2  51  26
## 102 2.11470492 30.7784942  3  1     2  51  26
## 103 9.04111851 25.4453871  4  0     2  52  26
## 104 3.39129320 37.8679635  2  1     2  52  26
## 105 9.04211228 26.6354192  6  0     2  53  27
## 106 3.99784800 43.6441463  5  1     2  53  27
## 107 6.35936341 40.1565408  6  0     2  54  27
## 108 7.99747483 26.8208712  7  0     2  54  27
## 109 4.40447242 47.2149865  6  0     2  55  28
## 110 8.03273979 28.4504319  6  0     2  55  28
## 111 9.03479311 30.4918954  5  0     2  56  28
## 112 3.21633814 34.7144548  4  1     2  56  28
## 113 1.16274254 18.5742013  2  1     2  57  29
## 114 3.38076958 37.0781720  2  1     2  57  29
## 115 1.25251094 22.5944761  3  1     2  58  29
## 116 5.87154603 38.5059444  4  0     2  58  29
## 117 1.77085263 32.5862954  3  0     2  59  30
## 118 3.43636802 36.4778478  4  1     2  59  30
## 119 7.16614059 29.6802797  3  0     2  60  30
## 120 3.76918128 40.6183397  5  1     2  60  30
## 121 1.52081588 29.7501823  1  1     2  61  31
## 122 5.37764869 40.1085457  7  0     2  61  31
## 123 0.68309405 26.2862360  2  1     2  62  31
## 124 5.80866236 38.1224013  6  0     2  62  31
## 125 5.32623267 34.4266149  4  1     2  63  32
## 126 1.55523806 28.9870979  4  1     2  63  32
## 127 1.77223938 31.0807937  3  1     2  64  32
## 128 9.47912345 25.0666566  7  0     2  64  32
## 129 7.34257603 34.7295232  3  0     2  65  33
## 130 7.56841214 30.5249015 12  0     2  65  33
## 131 6.38570772 38.8844988  4  0     2  66  33
## 132 3.23916002 40.8915645  6  1     2  66  33
## 133 1.55249906 27.9477605  3  1     2  67  34
## 134 0.44023583 17.5178401  3  1     2  67  34
## 135 5.10267347 38.4661073  4  1     2  68  34
## 136 6.25680008 35.7063530  5  1     2  68  34
## 137 3.03429356 35.4377378  7  1     2  69  35
## 138 5.58708904 34.4944798  8  0     2  69  35
## 139 6.21969731 34.1132328  3  0     2  70  35
## 140 9.35484102 23.6351578  4  0     2  70  35
## 141 9.51248483 18.8757761  8  0     2  71  36
## 142 6.13313907 38.2737395  8  0     2  71  36
## 143 9.33058083 20.5529819  9  0     2  72  36
## 144 0.99726202 26.3021062  4  1     2  72  36
## 145 5.05373145 36.3972843  3  0     2  73  37
## 146 3.98479898 40.5567099  4  0     2  73  37
## 147 1.77266492 27.3847921  3  1     2  74  37
## 148 2.24338040 28.1594832  2  1     2  74  37
## 149 3.46701554 39.0302418  6  1     2  75  38
## 150 4.06068979 34.0714501  4  1     2  75  38
## 151 5.82560360 38.2653651  4  1     2  76  38
## 152 8.81287538 22.8008357  6  0     2  76  38
## 153 4.57472311 36.2760541  4  1     2  77  39
## 154 8.24189374 29.1903484  3  0     2  77  39
## 155 4.76454221 44.6066641  5  1     2  78  39
## 156 7.98348782 27.5794882  2  0     2  78  39
## 157 5.24510010 41.9042338  2  0     2  79  40
## 158 6.69475207 39.1258420  4  0     2  79  40
## 159 2.70818324 38.7040567  9  1     2  80  40
## 160 7.52318812 35.7197554  5  0     2  80  40
## 161 3.69377164 43.4333492  6  1     2  81  41
## 162 4.69006520 39.3015355  0  1     2  81  41
## 163 0.60398059 20.6504112  3  1     2  82  41
## 164 0.43911146 11.3941301  3  1     2  82  41
## 165 5.81689226 34.8166924  5  1     2  83  42
## 166 6.09789830 37.6550523  7  0     2  83  42
## 167 9.81102144 24.9727956 11  0     2  84  42
## 168 3.13948817 40.4528939  2  1     2  84  42
## 169 9.74593014 20.2982802  4  0     2  85  43
## 170 6.17080719 38.8148035  7  0     2  85  43
## 171 8.92470426 18.9851511 10  0     2  86  43
## 172 6.44711653 33.5411911  6  0     2  86  43
## 173 3.63741036 34.8878700  2  1     2  87  44
## 174 7.36664722 30.5144068  5  0     2  87  44
## 175 6.63565478 35.2423764  8  0     2  88  44
## 176 5.79653555 41.0503333  1  0     2  88  44
## 177 3.20066856 33.9889591  7  1     2  89  45
## 178 8.30276330 32.0745613  4  0     2  89  45
## 179 2.59769300 33.9575573  2  1     2  90  45
## 180 7.17121302 35.3457404  3  0     2  90  45
## 181 3.66614757 34.6844817  3  1     2  91  46
## 182 2.94551030 36.8147255  3  1     2  91  46
## 183 7.63484824 31.7443404  5  0     2  92  46
## 184 8.38360701 26.6002184  6  0     2  92  46
## 185 7.95656429 36.3470091  3  0     2  93  47
## 186 7.21477790 39.0676688  4  0     2  93  47
## 187 1.03266907 22.7904631  3  1     2  94  47
## 188 6.26142937 39.1718546  9  0     2  94  47
## 189 3.75780662 36.3338405  5  1     2  95  48
## 190 5.93019124 41.2870574  9  0     2  95  48
## 191 1.27990952 19.2903026  3  1     2  96  48
## 192 8.97071640 25.0368343  5  0     2  96  48
## 193 1.64046864 25.3235371  2  0     2  97  49
## 194 5.27091961 39.5271491  8  1     2  97  49
## 195 3.24612225 36.6728042  4  0     2  98  49
## 196 3.67903721 47.7854039  4  0     2  98  49
## 197 0.73162182 22.7780922  2  1     2  99  50
## 198 6.69055712 37.5966075  4  0     2  99  50
## 199 4.08498638 44.6838712  1  0     2 100  50
## 200 9.18692898 22.4081259  7  0     2 100  50
## -----------------------------------------------------------------------------
m1 = flexmix(yn ~ x + I(x^2), data = NPreg, k = 2)

## -----------------------------------------------------------------------------
ggplot(NPreg, aes(x = x, y = yn)) + geom_point()

## -----------------------------------------------------------------------------
modeltools::parameters(m1, component = 1)
##                       Comp.1
## coef.(Intercept) -0.20801986
## coef.x            4.81563289
## coef.I(x^2)       0.03636977
## sigma             3.47480999
modeltools::parameters(m1, component = 2)
##                      Comp.2
## coef.(Intercept) 14.7181608
## coef.x            9.8452394
## coef.I(x^2)      -0.9682084
## sigma             3.4808821
modeltools::clusters(m1)
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
## [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2 2 2 1
## [186] 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## -----------------------------------------------------------------------------
table(NPreg$class, modeltools::clusters(m1))
##    
##      1  2
##   1 95  5
##   2  5 95
## -----------------------------------------------------------------------------
summary(m1)
## 
## Call:
## flexmix(formula = yn ~ x + I(x^2), data = NPreg, k = 2)
## 
##        prior size post>0 ratio
## Comp.1 0.494  100    145 0.690
## Comp.2 0.506  100    141 0.709
## 
## 'log Lik.' -642.5451 (df=9)
## AIC: 1303.09   BIC: 1332.775
## -----------------------------------------------------------------------------
NPreg = mutate(NPreg, gr = factor(class))
ggplot(NPreg, aes(x = x, y = yn, group = gr)) +
   geom_point(aes(colour = gr, shape = gr)) +
   scale_colour_hue(l = 40, c = 180)