# Load the locust serotonin data set
data1 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\locustSerotonin.csv",
  header = TRUE
)

# Preview the first rows of the table
head(data1)

# Number of observations recorded for each treatment time
table(data1$treatmentTime)

# Table dimensions: row count, a labelled row count, then column count
nrow(data1)
paste("number of rows = ", nrow(data1))
ncol(data1)

 0  1  2 
10 10 10


# Basic descriptive statistics for every column of data1
summary(data1)

# Uncomment the next line to open the help page for summary()
#?summary

 serotoninLevel   treatmentTime
 Min.   : 3.200   Min.   :0    
 1st Qu.: 4.675   1st Qu.:0    
 Median : 5.900   Median :1    
 Mean   : 8.407   Mean   :1    
 3rd Qu.:11.475   3rd Qu.:2    
 Max.   :21.300   Max.   :2


# Strip chart of serotonin level for each treatment time. Points are
# jittered, and a separate colour and plotting symbol is supplied per group.
stripchart(
  serotoninLevel ~ treatmentTime,
  data = data1,
  method = "jitter",
  vertical = TRUE,
  col = c("red", "blue", "black"),
  pch = 11:13
)

# The same relationship drawn through the formula interface of plot()
plot(data1$serotoninLevel ~ data1$treatmentTime)


# ggplot2 has to be installed once beforehand: install.packages("ggplot2")
library(ggplot2)

# Scatter plot of serotonin level (y) against treatment time (x).
# The aesthetics are declared once, by name, at the plot level so that the
# axis titles agree with the variables actually drawn on each axis.
ggplot(data1, aes(x = treatmentTime, y = serotoninLevel)) +
  geom_point(colour = "red", size = 3)


# Per-treatment-time mean, standard deviation and sample size
meanSerotonin <- with(data1, tapply(serotoninLevel, treatmentTime, mean))
sdSerotonin <- with(data1, tapply(serotoninLevel, treatmentTime, sd))
nSerotonin <- with(data1, tapply(serotoninLevel, treatmentTime, length))

# Standard error of each group mean: sd / sqrt(n)
seSerotonin <- sdSerotonin / sqrt(nSerotonin)

meanSerotonin

# Serotonin levels split out by treatment time (0, 1 and 2)
t0 <- data1$serotoninLevel[data1$treatmentTime == 0]
t1 <- data1$serotoninLevel[data1$treatmentTime == 1]
t2 <- data1$serotoninLevel[data1$treatmentTime == 2]

# Sample variance of each group, rounded to two decimal places
paste("variance at time 0 =", round(var(t0), 2))
paste("variance at time 1 =", round(var(t1), 2))
paste("variance at time 2 =", round(var(t2), 2))

# Bartlett test of homogeneity of variances across the treatment times
bartlett.test(data1$serotoninLevel, data1$treatmentTime)

	Bartlett test of homogeneity of variances

data:  data1$serotoninLevel and data1$treatmentTime
Bartlett's K-squared = 0.092008, df = 2, p-value = 0.955


# Strip chart of the raw data, to be overlaid with group means and error bars
stripchart(serotoninLevel ~ treatmentTime, data = data1,
           method = "jitter", vertical = TRUE)

# Shift the summaries slightly to the right of each group's column of points
offsetAmount <- 0.2
barX <- 1:3 + offsetAmount

# Vertical bars from (mean - SE) to (mean + SE)
segments(barX, meanSerotonin - seSerotonin,
         barX, meanSerotonin + seSerotonin)

# Filled circles at the group means
points(meanSerotonin ~ barX, pch = 16, cex = 1.2)


# One-way ANOVA of serotonin level across the treatment times.
# treatmentTime is stored as a number (0, 1, 2). Wrapping it in factor()
# makes aov() compare the three group means (2 treatment df) instead of
# fitting a straight-line trend through the time values (1 df).
# NOTE(review): an ANOVA table recorded from the numeric coding (Df = 1)
# has to be regenerated after this change.
data1.aov <- aov(serotoninLevel ~ factor(treatmentTime), data = data1)
summary(data1.aov)

              Df Sum Sq Mean Sq F value Pr(>F)  
treatmentTime  1   99.5   99.46   4.046  0.054 .
Residuals     28  688.3   24.58                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


# Pairwise comparisons of mean serotonin level between treatment times.
# With its default arguments t.test() runs a two-sided Welch two-sample test.
# NOTE(review): three pairwise tests are run without any multiple-comparison
# adjustment — confirm this is intended.
t.test(t0, t1)

t.test(t0, t2)

t.test(t1, t2)

	Welch Two Sample t-test

data:  t0 and t1
t = -0.76785, df = 17.984, p-value = 0.4525
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -6.276926  2.916926
sample estimates:
mean of x mean of y 
     6.36      8.04

	Welch Two Sample t-test

data:  t0 and t2
t = -1.9631, df = 17.822, p-value = 0.06543
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -9.2365083  0.3165083
sample estimates:
mean of x mean of y 
     6.36     10.82

	Welch Two Sample t-test

data:  t1 and t2
t = -1.2073, df = 17.911, p-value = 0.243
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -7.619431  2.059431
sample estimates:
mean of x mean of y 
     8.04     10.82


# Load the DeathsFromTigers data set
data2 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\DeathsFromTigers.csv",
  header = TRUE
)

# Preview the first rows
head(data2)


# Frequency of each activity, most common first
tigerTable <- sort(table(data2$activity), decreasing = TRUE)
tigerTable

# The same counts as a data frame, with a "Sum" entry appended by addmargins()
tigerTable2 <- data.frame(Frequency = addmargins(tigerTable))
tigerTable2

         Grass/fodder       Forest products               Fishing 
                   44                    11                     8 
              Herding Disturbing tiger kill       Fuelwood/timber 
                    7                     5                     5 
    Sleeping in house               Walking                Toilet 
                    3                     3                     2


# Bar plot of the activity frequencies. cex.names shrinks the bar labels
# and las = 2 draws the axis labels perpendicular to the axes.
barplot(tigerTable, ylab = "Frequency", cex.names = 0.5, las = 2)


# Load the SalmonBodySize data set
data3 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\SalmonBodySize.csv",
  header = TRUE
)

# Inspect both ends of the table, then summarise every column
head(data3)
tail(data3)
summary(data3)

      year         sex          oceanAgeYears      lengthMm         massKg     
 Min.   :1996   Mode :logical   Min.   :2.000   Min.   :389.0   Min.   :1.180  
 1st Qu.:1996   FALSE:228       1st Qu.:2.000   1st Qu.:427.0   1st Qu.:1.641  
 Median :1996                   Median :2.000   Median :447.0   Median :1.855  
 Mean   :1996                   Mean   :2.241   Mean   :450.4   Mean   :2.028  
 3rd Qu.:1996                   3rd Qu.:2.000   3rd Qu.:459.8   3rd Qu.:2.266  
 Max.   :1996                   Max.   :3.000   Max.   :550.0   Max.   :3.528


# Histograms of salmon mass drawn with three different bin widths
# (0.1, 0.3 and 0.5 kg); bins are left-closed because right = FALSE
for (binWidth in c(0.1, 0.3, 0.5)) {
  hist(data3$massKg, right = FALSE, breaks = seq(1, 4, by = binWidth),
       col = "firebrick")
}


# Correlation coefficient between mass and length (cor() defaults to Pearson)
cor(data3$massKg, data3$lengthMm)

# Test of the null hypothesis that the true correlation is zero;
# also reports a 95% confidence interval for the correlation
cor.test(data3$massKg, data3$lengthMm)

	Pearson's product-moment correlation

data:  data3$massKg and data3$lengthMm
t = 50.423, df = 226, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.9461990 0.9677459
sample estimates:
      cor 
0.9583138


# Split the salmon into those with 2 and those with 3 ocean-age years
age2 <- data3[data3$oceanAgeYears == 2, ]
age3 <- data3[data3$oceanAgeYears == 3, ]

# Preview each subset
head(age2)
head(age3)

# Mass-length correlation within each age group
cor(age2$massKg, age2$lengthMm)
cor(age3$massKg, age3$lengthMm)

# Significance test and confidence interval for each correlation
cor.test(age2$massKg, age2$lengthMm)
cor.test(age3$massKg, age3$lengthMm)

	Pearson's product-moment correlation

data:  age2$massKg and age2$lengthMm
t = 21.961, df = 171, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.8144144 0.8938276
sample estimates:
      cor 
0.8592112

	Pearson's product-moment correlation

data:  age3$massKg and age3$lengthMm
t = 8.6003, df = 53, p-value = 1.245e-11
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6243922 0.8553533
sample estimates:
      cor 
0.7632564


# Scatter plots of length (y) against mass (x) with a smooth curve added,
# one plot per ocean-age group
scatter.smooth(age2$massKg, age2$lengthMm)
scatter.smooth(age3$massKg, age3$lengthMm)


# Independent two-sample t-test (Welch, the t.test() default) comparing
# mean mass between the two ocean-age groups
t.test(age2$massKg, age3$massKg)

	Welch Two Sample t-test

data:  age2$massKg and age3$massKg
t = -24.659, df = 73.687, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.203761 -1.023754
sample estimates:
mean of x mean of y 
 1.759497  2.873255


# Same Welch two-sample comparison, this time for mean length
t.test(age2$lengthMm, age3$lengthMm)

	Welch Two Sample t-test

data:  age2$lengthMm and age3$lengthMm
t = -24.787, df = 84.802, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -72.87186 -62.04901
sample estimates:
mean of x mean of y 
 434.1214  501.5818


# Simple linear regression y = a*x + b with mass as the response (y)
# and length as the predictor (x)
linearModSalmon <- lm(formula = massKg ~ lengthMm, data = data3)

# Show the model call and the fitted intercept and slope
print(linearModSalmon)

Call:
lm(formula = massKg ~ lengthMm, data = data3)

Coefficients:
(Intercept)     lengthMm  
   -4.92825      0.01545


# Load the BirdMalaria data set and preview it
data4 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\BirdMalaria.csv",
  header = TRUE
)
head(data4)

# Store treatment as a factor so "Egg removal" is listed before "Control"
data4$treatment <- factor(data4$treatment,
                          levels = c("Egg removal", "Control"))

# Contingency table: response in rows, treatment in columns
birdMalariaTable <- table(data4$response, data4$treatment)
birdMalariaTable

# Same table with row and column totals appended
addmargins(birdMalariaTable, margin = c(1, 2), FUN = sum, quiet = TRUE)

            
             Egg removal Control
  Malaria             15       7
  No Malaria          15      28


# Grouped bar plot of the contingency table: beside = TRUE places the bars
# of each column next to each other instead of stacking them
barplot(as.matrix(birdMalariaTable), beside = TRUE)


# Chi-square test of independence between the row variable (response)
# and the column variable (treatment) of the contingency table
chisq.test(birdMalariaTable)

# Fisher's exact test on the same table: an alternative for when the
# expected frequencies are small
fisher.test(birdMalariaTable)

	Pearson's Chi-squared test with Yates' continuity correction

data:  birdMalariaTable
X-squared = 5.2224, df = 1, p-value = 0.0223

	Fisher's Exact Test for Count Data

data:  birdMalariaTable
p-value = 0.01739
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
  1.188903 14.074179
sample estimates:
odds ratio 
  3.909422


# Load the GuppyFatherSonAttractiveness data set and preview it
data5 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\GuppyFatherSonAttractiveness.csv",
  header = TRUE
)
head(data5)

# Scatter plot of son attractiveness against father ornamentation
plot(sonAttractiveness ~ fatherOrnamentation, data = data5)

# Correlation coefficient between the two variables
cor(data5$sonAttractiveness, data5$fatherOrnamentation)

# Significance test and confidence interval for that correlation
cor.test(data5$sonAttractiveness, data5$fatherOrnamentation)

	Pearson's product-moment correlation

data:  data5$sonAttractiveness and data5$fatherOrnamentation
t = 4.5371, df = 34, p-value = 6.784e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.3577455 0.7843860
sample estimates:
      cor 
0.6141043


# Simple linear regression y = a*x + b with son attractiveness as the
# response (y) and father ornamentation as the predictor (x)
linearModGuppy <- lm(formula = sonAttractiveness ~ fatherOrnamentation,
                     data = data5)

# Show the model call and the fitted intercept and slope
print(linearModGuppy)

Call:
lm(formula = sonAttractiveness ~ fatherOrnamentation, data = data5)

Coefficients:
        (Intercept)  fatherOrnamentation  
           0.005084             0.982285


# Load the HumanHemoglobinElevation data set and preview it
data6 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\HumanHemoglobinElevation.csv",
  header = TRUE
)
head(data6)

# Sample size of each population
table(data6$population)

   Andes Ethiopia    Tibet      USA 
      71      128       59     1704


# Jittered strip chart of hemoglobin for each population
stripchart(hemoglobin ~ population, data = data6,
           method = "jitter", vertical = TRUE)

# Box plot with default settings
boxplot(hemoglobin ~ population, data = data6)

# Styled box plot. par(bty = "l") changes the box type of the plot frame;
# the setting is not restored, so it also applies to later plots.
par(bty = "l")
boxplot(
  hemoglobin ~ population,
  data = data6,
  col = "goldenrod1",
  boxwex = 0.5,
  whisklty = 1,
  outcol = "black",
  outcex = 1,
  outlty = "blank",
  las = 1,
  xlab = "Male population",
  ylab = "Hemoglobin concentration (g/dL)"
)

# Hemoglobin values for the Andes and USA populations only
Andes.hem <- data6$hemoglobin[data6$population == "Andes"]
USA.hem <- data6$hemoglobin[data6$population == "USA"]

# Independent two-group t-test (Welch, the t.test() default)
t.test(Andes.hem, USA.hem)

# Sample sizes again, for reference next to the test result
table(data6$population)

	Welch Two Sample t-test

data:  Andes.hem and USA.hem
t = 17.723, df = 71.666, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 3.417099 4.283312
sample estimates:
mean of x mean of y 
 19.23099  15.38078

   Andes Ethiopia    Tibet      USA 
      71      128       59     1704

   Andes Ethiopia    Tibet      USA 
       0        0        0     1704


# Load the MeaslesOutbreaks data set and preview it
data7 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\MeaslesOutbreaks.csv",
  header = TRUE
)
head(data7)

# Time series: confirmed cases against quarter, drawn as a line (type = "l")
plot(confirmedCases ~ yearByQuarter, data = data7, type = "l")


# Load the SpiderAmputation data set, preview it, and count the rows
# recorded under each treatment label
data8 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\SpiderAmputation.csv",
  header = TRUE
)
head(data8)
table(data8$treatment)

 after before 
    16     16


# Order the treatment levels so "before" is plotted ahead of "after"
data8$treatment <- factor(data8$treatment, levels = c("before", "after"))
boxplot(speed ~ treatment, data = data8)

# Rows for each treatment, printed in full
speedBefore <- subset(data8, treatment == "before")
speedBefore
speedAfter <- subset(data8, treatment == "after")
speedAfter

# Median speed under each treatment
median(speedBefore$speed)
median(speedAfter$speed)

# First and third quartiles under each treatment (quantile algorithm type 5)
quantile(speedBefore$speed, probs = c(0.25, 0.75), type = 5)
quantile(speedAfter$speed, probs = c(0.25, 0.75), type = 5)

# Paired t-test of before vs after speeds.
# NOTE(review): the pairing is positional, so this assumes both subsets list
# the spiders in the same order — confirm against the spider column.
t.test(speedBefore$speed, speedAfter$speed, paired = TRUE)

	Paired t-test

data:  speedBefore$speed and speedAfter$speed
t = -4.4166, df = 15, p-value = 5e-04
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.757801 -0.613449
sample estimates:
mean of the differences 
              -1.185625


# Load the SticklebackPlates data set, preview it, and count the fish
# of each genotype
data9 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\SticklebackPlates.csv",
  header = TRUE
)
head(data9)
table(data9$genotype)

 mm  Mm  MM 
 88 174  82


# Order the genotype levels as MM, Mm, mm
data9$genotype <- factor(data9$genotype, levels = c("MM", "Mm", "mm"))
library(lattice)

# NOTE(review): lattice histogram formulas are normally written `~ x | g`.
# The effect of the left-hand side in `plates ~ genotype` should be checked;
# this call may be summarising genotype rather than plates.
histogram(plates ~ genotype, data = data9, col = "firebrick")

# Histogram of plate counts in a separate panel per genotype,
# stacked in one column of three rows, with bins of width 2 from 0 to 70
histogram(
  ~ plates | genotype,
  data = data9,
  breaks = seq(0, 70, by = 2),
  layout = c(1, 3),
  col = "firebrick"
)


# Sample size per genotype
n <- tapply(data9$plates, INDEX = data9$genotype, FUN = length)
n

# Mean plate count per genotype, rounded to one decimal place
meanPlates <- round(
  tapply(data9$plates, INDEX = data9$genotype, FUN = mean),
  digits = 1
)
meanPlates

# Median plate count per genotype
medianPlates <- tapply(data9$plates, INDEX = data9$genotype, FUN = median)
medianPlates

# Standard deviation of plate count per genotype, rounded to one decimal place
sdPlates <- round(
  tapply(data9$plates, INDEX = data9$genotype, FUN = sd),
  1
)
sdPlates

# Collect the per-genotype summaries into a single table
sticklebackTable <- data.frame(
  genotype = names(n),
  n = n,
  mean = meanPlates,
  median = medianPlates,
  sd = sdPlates
)
sticklebackTable


# Frequency table of genotypes as a data frame with columns
# `genotype` and `Freq`. The counts are built from data9 here so the object
# exists before it is converted; previously it was referenced without ever
# having been created.
sticklebackFreq <- data.frame(table(genotype = data9$genotype))
sticklebackFreq

# Add each genotype's share of the total count
sticklebackFreq$proportion <- sticklebackFreq$Freq / sum(sticklebackFreq$Freq)
sticklebackFreq


# Observed genotype counts, entered by hand in the order MM, Mm, mm
experimentalCount <- c(82, 174, 88)

# Chi-square goodness-of-fit test against expected proportions 1/4 : 1/2 : 1/4
res <- chisq.test(experimentalCount, p = c(0.25, 0.5, 0.25))
res

	Chi-squared test for given probabilities

data:  experimentalCount
X-squared = 0.25581, df = 2, p-value = 0.8799


# Load the GlidingSnakes data set and preview it
data10 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\GlidingSnakes.csv",
  header = TRUE
)
head(data10)

# Histogram of undulation rate with left-closed bins
hist(data10$undulationRateHz, right = FALSE)

# Mean, standard deviation and variance of undulation rate, rounded to two
# decimals. The column is referenced by its full name rather than relying
# on `$` partial matching of a shortened name.
round(mean(data10$undulationRateHz), 2)
round(sd(data10$undulationRateHz), 2)
round(var(data10$undulationRateHz), 2)


# Load the HumanGeneLengths data set, preview it, and count its rows
data11 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\HumanGeneLengths.csv",
  header = TRUE
)
head(data11)
nrow(data11)

# One-sentence summaries of gene length: mean and SD rounded to two
# decimals, then the largest and smallest values
paste0("Mean of the gene length = ", round(mean(data11$geneLength), 2), ".")
paste0("SD of the gene length = ", round(sd(data11$geneLength), 2), ".")
paste0("Maximum length of genes = ", max(data11$geneLength), ".")
paste0("Minimum length of genes = ", min(data11$geneLength), ".")


# Load the XGeneContent data set, show its last rows and its row count,
# then count the genes recorded under each chromosome label
data12 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\XGeneContent.csv",
  header = TRUE
)
tail(data12)
nrow(data12)
table(data12$chromosome)

 NotX     X 
19509   781


# Order the chromosome levels so "X" comes before "NotX"
data12$chromosome <- factor(data12$chromosome, levels = c("X", "NotX"))

# Observed gene counts per category, shown with a "Sum" entry appended
geneContentTable <- table(data12$chromosome)
data.frame(Frequency = addmargins(geneContentTable))

# Chi-square goodness-of-fit test against expected proportions
# 882/20290 (X) and 19408/20290 (NotX)
expectedProportion <- c(882, 19408) / 20290
chisq.test(geneContentTable, p = expectedProportion)

	Chi-squared test for given probabilities

data:  geneContentTable
X-squared = 12.091, df = 1, p-value = 0.0005066


# Load the DesertBirdAbundance data set, preview it, and summarise
# every column
data13 <- read.csv(
  "C:\\Users\\TeerasakArt\\Downloads\\DesertBirdAbundance.csv",
  header = TRUE
)
head(data13)
summary(data13)

                    species     abundance     
 American Kestrel       : 1   Min.   :  1.00  
 Ash-throated Flycatcher: 1   1st Qu.:  3.50  
 Bell's Vireo           : 1   Median : 18.00  
 Black-chin. Hummingbird: 1   Mean   : 74.77  
 Black-tail. Gnatcatcher: 1   3rd Qu.:102.50  
 Black-throated Sparrow : 1   Max.   :625.00  
 (Other)                :37


# Left-closed abundance classes of width 50 covering 0 to 650
abundanceBreaks <- seq(0, 650, by = 50)

# Number of species falling in each abundance class
birdAbundanceTable <- table(
  cut(data13$abundance, breaks = abundanceBreaks, right = FALSE)
)
birdAbundanceTable

# The same counts as a data frame, with a "Sum" entry appended
data.frame(Frequency = addmargins(birdAbundanceTable))

   [0,50)  [50,100) [100,150) [150,200) [200,250) [250,300) [300,350) [350,400) 
       28         4         3         3         1         2         1         0 
[400,450) [450,500) [500,550) [550,600) [600,650) 
        0         0         0         0         1


# Histogram of species abundance with left-closed bins (right = FALSE)
hist(data13$abundance, right = FALSE)

fatherOrnamentation	sonAttractiveness
0.35	-0.32
0.03	-0.03
0.14	0.11
0.10	0.28
0.22	0.31
0.23	0.18

id	hemoglobin	population
US.Sea.level1	10.40	USA
US.Sea.level2	11.20	USA
US.Sea.level3	11.70	USA
US.Sea.level4	11.80	USA
US.Sea.level5	11.90	USA
US.Sea.level6	12.05	USA

id	hemoglobin	population
US.Sea.level1	10.40	USA
US.Sea.level2	11.20	USA
US.Sea.level3	11.70	USA
US.Sea.level4	11.80	USA
US.Sea.level5	11.90	USA
US.Sea.level6	12.05	USA

year	quarter	confirmedCases	yearByQuarter
2011	4th	136	2011.88
2011	3rd	154	2011.62
2011	2nd	346	2011.38
2011	1st	151	2011.12
2010	4th	31	2010.88
2010	3rd	134	2010.62

person	activity
1	Disturbing tiger kill
2	Forest products
3	Grass/fodder
4	Fuelwood/timber
5	Grass/fodder
6	Forest products

Frequency.Var1	Frequency.Freq
Grass/fodder	44
Forest products	11
Fishing	8
Herding	7
Disturbing tiger kill	5
Fuelwood/timber	5
Sleeping in house	3
Walking	3
Toilet	2
Sum	88

year	sex	oceanAgeYears	lengthMm	massKg
1996	FALSE	3	513	3.090
1996	FALSE	3	513	2.909
1996	FALSE	3	525	3.056
1996	FALSE	3	501	2.690
1996	FALSE	3	513	2.876
1996	FALSE	3	501	2.978

	year	sex	oceanAgeYears	lengthMm	massKg
223	1996	FALSE	2	447	1.930
224	1996	FALSE	2	427	1.715
225	1996	FALSE	2	427	1.587
226	1996	FALSE	2	447	1.825
227	1996	FALSE	2	447	1.859
228	1996	FALSE	2	427	1.581

	year	sex	oceanAgeYears	lengthMm	massKg
7	1996	FALSE	2	427	1.610
8	1996	FALSE	2	457	2.156
9	1996	FALSE	2	427	1.563
10	1996	FALSE	2	447	1.763
11	1996	FALSE	2	437	1.790
13	1996	FALSE	2	457	1.906

bird	treatment	response
1	Control	Malaria
2	Control	Malaria
3	Control	Malaria
4	Control	Malaria
5	Control	Malaria
6	Control	Malaria

spider	speed	treatment
1	1.25	before
2	2.94	before
3	2.38	before
4	3.09	before
5	3.41	before
6	3.00	before

	spider	speed	treatment
17	1	2.40	after
18	2	3.50	after
19	3	4.49	after
20	4	3.17	after
21	5	5.26	after
22	6	3.22	after
23	7	2.32	after
24	8	3.31	after
25	9	3.70	after
26	10	4.70	after
27	11	4.94	after
28	12	5.06	after
29	13	3.22	after
30	14	3.52	after
31	15	5.45	after
32	16	3.40	after

species	abundance
Black Vulture	64
Turkey Vulture	23
Harris's Hawk	3
Red-tailed Hawk	16
American Kestrel	7
Gambel's Quail	148

Frequency.Var1	Frequency.Freq
[0,50)	28
[50,100)	4
[100,150)	3
[150,200)	3
[200,250)	1
[250,300)	2
[300,350)	1
[350,400)	0
[400,450)	0
[450,500)	0
[500,550)	0
[550,600)	0
[600,650)	1
Sum	43

serotoninLevel	treatmentTime
5.3	0
4.6	0
4.5	0
4.3	0
4.2	0
3.6	0

id	plates	genotype
4-1	11	mm
4-2	63	Mm
4-4	22	Mm
4-5	10	Mm
4-10	14	mm
4-12	11	mm

	genotype	n	mean	median	sd
MM	MM	82	62.8	63	3.4
Mm	Mm	174	50.4	59	15.1
mm	mm	88	11.7	11	3.6

genotype	Freq
MM	82
Mm	174
mm	88