การวิเคราะห์สถิติพื้นฐานด้วยภาษา R

ภาษา R นิยมนำมาใช้เพื่อวิเคราะห์และนำเสนอข้อมูลทางสถิติ เนื่องจากมีฟังก์ชั่นและแพคเกจที่เหมาะสมกับประเภทของข้อมูลให้เลือกใช้เป็นจำนวนมาก ก่อนเริ่มต้นวิเคราะห์ข้อมูลจะต้องนำข้อมูลเข้าสู่โปรแกรม ทำได้หลายวิธี เช่น ใช้ฟังก์ชั่น read.table() read.csv() หรือ read.xlsx() โดยสามารถดาวน์โหลดข้อมูลได้จากลิงค์ https://drive.google.com/drive/folders/1LXGFnzDfKpkxBCsRQFj-67vZ4eoLG0Os?usp=sharing โดยสามารถดาวน์โหลดโปรแกรม R ได้จาก (https://www.r-project.org/) และ RStudio (https://rstudio.com/)

# read input data
data1 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\locustSerotonin.csv", header = TRUE)

head(data1) #show the first part of the table
nrow(data1) #count number of rows

ฟังก์ชั่น summary() ใช้เพื่อรายงานสถิติเบื้องต้นของข้อมูล

#basic descriptive statistic of the data1
summary(data1)

 serotoninLevel   treatmentTime
 Min.   : 3.200   Min.   :0    
 1st Qu.: 4.675   1st Qu.:0    
 Median : 5.900   Median :1    
 Mean   : 8.407   Mean   :1    
 3rd Qu.:11.475   3rd Qu.:2    
 Max.   :21.300   Max.   :2

การแสดงข้อมูลด้วย stripchart() ระหว่าง serotoninLevel กับ treatmentTime ทำให้เห็นการกระจายของข้อมูลระดับฮอร์โมนที่ระยะเวลาต่างๆ

# plot a strip chart
stripchart(serotoninLevel ~ treatmentTime, data = data1, method = "jitter", vertical = TRUE)

เราสามารถคำนวณค่าเฉลี่ย (mean) ส่วนเบี่ยงเบนมาตรฐาน (sd) และค่าคลาดเคลื่อน (se) ของระดับฮอร์โมนในแต่ละช่วงเวลาได้ดังนี้

meanSerotonin <- tapply(data1$serotoninLevel, data1$treatmentTime, mean)
sdSerotonin <- tapply(data1$serotoninLevel, data1$treatmentTime, sd)
nSerotonin <- tapply(data1$serotoninLevel, data1$treatmentTime, length)
seSerotonin <- sdSerotonin / sqrt(nSerotonin)
meanSerotonin

ทำการเพิ่มข้อมูลค่าเฉลี่ยและค่าคลาดเคลื่อนลงใน stripchart ด้วยฟังก์ชั่น segments() และ points()

stripchart(serotoninLevel ~ treatmentTime, data = data1, method = "jitter", vertical = TRUE)
#Add error bars to the chart
offsetAmount <- 0.2
segments( c(c(1,2,3) + offsetAmount), meanSerotonin - seSerotonin, 
	  c(c(1,2,3) + offsetAmount), meanSerotonin + seSerotonin)
points(meanSerotonin ~ c(c(1,2,3) + offsetAmount), pch = 16, cex = 1.2)

การรู้จักและเข้าใจรูปแบบของข้อมูลเป็นสิ่งจำเป็น ก่อนเลือกวิธีวิเคราะห์ทางสถิติ จะต้องรู้จักลักษณะเบื้องต้นของข้อมูลที่มีก่อน (data exploration) รวมถึงหากจำเป็นต้องเตรียมหรือจัดรูปแบบของข้อมูลให้เหมาะสม(data cleaning and preparation) เช่น หากมีข้อมูลสูญหาย (missing data, NA) จากตัวอย่างของข้อมูล locustSerotonin ซึ่งแสดงข้อมูลระดับฮอร์โมน serotonin ในแมลงที่เวลา 0 1 และ 2 ชั่วโมง ข้อมูลชุดนี้มีสองตัวแปรคือ เวลาเป็นตัวแปรอิสระ และระดับของฮอร์โมนเป็นตัวแปรตาม เมื่อเข้าใจลักษณะของข้อมูลแล้ว จึงสามารถเลือกวิธีการทางสถิติเชิงบรรยาย (descriptive statistics) เพื่ออธิบายข้อมูลนั้นๆ ได้ เช่นการคำนวนค่าเฉลี่ย (mean) ส่วนเบี่ยงเบนมาตรฐาน (standard deviation) ค่ากลาง (mode) รวมทั้งการวิเคราะห์สถิติเชิงอนุมาน (Inferential statistics) เพื่อตอบสมมติฐานหรือหาข้อสรุปจากข้อมูลตัวอย่างที่มี เช่น การวิเคราะห์สหสัมพันธ์ (correlation) การวิเคราะห์ความแปรปรวน (analysis of variance)

หากมีคำถามว่าระดับฮอร์โมนจากสามช่วงเวลามีความแตกต่างกันหรือไม่ เมื่อทำการวิเคราะห์ ANOVA ด้วยฟังก์ชั่น aov() พบว่ายอมรับสมมติฐานหลักซึ่งกล่าวว่า ระดับของฮอร์โมนที่ระยะเวลาต่างๆ นั้นไม่แตกต่างกันที่ p < 0.05

data1.aov = aov(serotoninLevel ~ treatmentTime, data = data1)
summary(data1.aov)

              Df Sum Sq Mean Sq F value Pr(>F)  
treatmentTime  1   99.5   99.46   4.046  0.054 .
Residuals     28  688.3   24.58                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ข้อมูลชุดที่ 2 ข้อมูลเหตุการเสียชีวิตจากเสือในเนปาล เป็นข้อมูลเชิงคุณภาพที่ต้องนับหรือแจกแจงความถี่ (frequency)

#read input data
data2 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\DeathsFromTigers.csv", header = TRUE)

head(data2)

ฟังก์ชั่น table() ใช้สำหรับแจกแจงความถี่ของข้อมูลแล้วจัดเรียงค่าที่นับได้ด้วยฟังก์ชั่น sort() สามารถจัดรูปแบบเป็นตารางด้วยฟังก์ชั่น data.frame() เพิ่มผลรวมค่าความถี่ด้วยฟังก์ชั่น addmargins()

#generate frequency table
tigerTable <- sort(table(data2$activity), decreasing = TRUE)
tigerTable

#make a table format
data.frame(Frequency = addmargins(tigerTable))
tigerTable2 = data.frame(Frequency = addmargins(tigerTable))

         Grass/fodder       Forest products               Fishing 
                   44                    11                     8 
              Herding Disturbing tiger kill       Fuelwood/timber 
                    7                     5                     5 
    Sleeping in house               Walking                Toilet 
                    3                     3                     2

แสดงผลความถี่ด้วยกราฟแท่ง (bar plot)

# barplot
barplot(tigerTable, ylab = "Frequency", cex.names = 0.5, las = 2)

จากกราฟแท่งจะพบว่าสาเหตุการเสียชีวิตหลักในข้อมูลุดนี้คือ การตัดหญ้าหรือเก็บฟางเลี้ยงสัตว์ (grass/fodder) ข้อมูลที่มีปริมาณมากและหลากหลายเมื่อแสดงข้อมูลในรูปแบบกราฟจะทำให้เห็นรูปแบบและความสัมพันธ์ของข้อมูลชัดเจนขึ้น

ข้อมูลชุดที่ 3 เป็นข้อมูลความยาว (lengthMm) และน้ำหนัก (massKg) ของแซลมอนเพศเมียจำนวน 228 ตัว ที่มีอายุ 2 และ 3 ปี

data3 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\SalmonBodySize.csv", header = TRUE)

head(data3)
tail(data3) #show the end of the table
summary(data3)

      year         sex          oceanAgeYears      lengthMm         massKg     
 Min.   :1996   Mode :logical   Min.   :2.000   Min.   :389.0   Min.   :1.180  
 1st Qu.:1996   FALSE:228       1st Qu.:2.000   1st Qu.:427.0   1st Qu.:1.641  
 Median :1996                   Median :2.000   Median :447.0   Median :1.855  
 Mean   :1996                   Mean   :2.241   Mean   :450.4   Mean   :2.028  
 3rd Qu.:1996                   3rd Qu.:2.000   3rd Qu.:459.8   3rd Qu.:2.266  
 Max.   :1996                   Max.   :3.000   Max.   :550.0   Max.   :3.528

แสดงข้อมูลน้ำหนักเป็นค่าความถี่ด้วยฟังก์ชั่น hist() โดยกำหนดช่วงของน้ำหนักเป็น 0.1 0.3 และ 0.5 ด้วยฟังก์ชั่น seq() โดยกำหนดค่าต่ำสุดเป็น 1 และสูงสุดเป็น 4

# plot histograms
hist(data3$massKg, right = FALSE, breaks = seq(1,4,by=0.1), col = "firebrick")
hist(data3$massKg, right = FALSE, breaks = seq(1,4,by=0.3), col = "firebrick")
hist(data3$massKg, right = FALSE, breaks = seq(1,4,by=0.5), col = "firebrick")

ความสัมพันธ์ระหว่างขนาดและน้ำหนักสามารถตรวจดูเบื้องต้นด้วยการแสดงกราฟการกระจาย (scatter plot) ทำสอบความสัมพันธ์ระหว่างขนาดกับน้ำหนักด้วยการวิเคราะห์ correlation ด้วยฟังก์ชั่น cor() รวมถึงการเปรียบเทียบขนาดและน้ำหนักของปลาที่มีอายุ 2 และ 3 ปี ด้วยการวิเคราะห์ independent t-test โดยใช้ฟังชั่น t.test() พบว่าตัวแปรทั้งสองมีความสัมพันธ์กัน

#correlation
cor(data3$massKg, data3$lengthMm)

ทำการแยกข้อมูลของปลาอายุ 2 และ 3 ปี เก็บในตัวแปรใหม่ชื่อ age2 และ age3 แล้ววิเคราะห์สหสัมพันธ์ระหว่างขนาดและน้ำหนักของปลาแต่ละอายุ

age2 = data3[data3$oceanAgeYears == 2, ]
age3 = data3[data3$oceanAgeYears == 3, ]

head(age2)
head(age3)

cor(age2$massKg, age2$lengthMm)
cor(age3$massKg, age3$lengthMm)

แสดงความสัมพันธ์ของข้อมูลด้วย scatter plot เพื่อศึกษาการกระจายและแนวโน้มของข้อมูล โดยใช้ฟังก์ชั่น scatter.smooth()

scatter.smooth(age2$massKg, age2$lengthMm)
scatter.smooth(age3$massKg, age3$lengthMm)

# independent t-test
t.test(age2$massKg, age3$massKg)

	Welch Two Sample t-test

data:  age2$massKg and age3$massKg
t = -24.659, df = 73.687, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.203761 -1.023754
sample estimates:
mean of x mean of y 
 1.759497  2.873255

t.test(age2$lengthMm, age3$lengthMm)

	Welch Two Sample t-test

data:  age2$lengthMm and age3$lengthMm
t = -24.787, df = 84.802, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -72.87186 -62.04901
sample estimates:
mean of x mean of y 
 434.1214  501.5818

จากการทดสอบ t-test ดังกล่าวพบว่าขนาดและน้ำหนักของปลาอายุ 2 และ 3 ปีมีความแตกต่างกันอย่างมีนัยสำคัญยิ่งทางสถิติที่ p < 0.01 หากต้องการวิเคราะห์ความสัมพันธ์ของตัวแปรทั้งสองเพื่อใช้ทำนายค่าน้ำหนักหรือขนาดที่ไม่ทราบมาก่อนได้ ด้วยการสร้างโมเดลเชิงเส้น (linear model) โดยใช้ฟังก์ชั่น lm()

#Build linear model y= ax + b
linearModSalmon <- lm(massKg ~ lengthMm, data=data3)  
print(linearModSalmon)

Call:
lm(formula = massKg ~ lengthMm, data = data3)

Coefficients:
(Intercept)     lengthMm  
   -4.92825      0.01545

จากโมเดลดังกล่าวพบความสัมพันธ์ น้ำหนัก = 0.02(ขนาด) - 4.93 สามารถใช้ความสัมพันธ์นี้ทำนายค่าน้ำหนักหรือขนาดที่ไม่ทราบมาก่อนได้

ข้อมูลชุดที่ 4 เป็นข้อมูลการติดเชื้อมาลาเรียในนกแสดงความสัมพันธ์ของการแยกไข่นกออกกับการติดเชื้อมาลาเรียในนก

data4 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\BirdMalaria.csv", header = TRUE)
head(data4)

ในกรณีนี้ข้อมูลมาจากการทดลองแบบ factorial design 2x2 ซึ่งทดสอบสองปัจจัย (factors) และแต่ละปัจจัยมีสองระดับ (levels) ใช้ฟังก์ชั่น factor() ในการเปลี่ยนข้อมูลกลุ่ม (treatment) ให้เป็น levels สร้างตาราง contingency ด้วยฟังก์ชั่น table() และคำนวนผลรวมด้วยฟังก์ชั่น addmargins()

#convert treatment to factor
data4$treatment <- factor(data4$treatment, levels= c("Egg removal", "Control"))

# make a contingency table
birdMalariaTable <- table(data4$response, data4$treatment)
birdMalariaTable

#add row and column summation
addmargins(birdMalariaTable, margin = c(1,2), FUN = sum, quiet = TRUE)

            
             Egg removal Control
  Malaria             15       7
  No Malaria          15      28

เมื่อแสดงผลด้วยกราฟแท่งของทั้งสองปัจจัยจะเห็นว่า การไม่แยกไข่ออก (control) ทำให้นกเป็นมาลาเรียน้อยลง หากต้องการทดสอบว่าสองปัจจัยนี้มีความเกี่ยวข้องหรือไม่ ทำได้ด้วยการทดสอบไคน์สแควร์ โดยใช้ฟังก์ชั่น chisq.test()

#barplot
barplot(as.matrix(birdMalariaTable), beside = TRUE)

#Test of Independence (Chi-Square Test) 
#Test independence of the row and column variable
chisq.test(birdMalariaTable)

fisher.test(birdMalariaTable)

	Pearson's Chi-squared test with Yates' continuity correction

data:  birdMalariaTable
X-squared = 5.2224, df = 1, p-value = 0.0223

	Fisher's Exact Test for Count Data

data:  birdMalariaTable
p-value = 0.01739
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
  1.188903 14.074179
sample estimates:
odds ratio 
  3.909422

ผลการทดสอบพบค่า p value น้อยกว่า 0.05 แสดงว่าปัจจัยทั้งสองนั้นไม่เป็นอิสระต่อกัน หรืออาจเกี่ยวข้องกันนั่นเอง

ข้อมูลชุดที่ 5 เป็นข้อมูลลวดลายของพ่อปลาหางนกยูงกับลาดลายของลูกปลา

data5 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\GuppyFatherSonAttractiveness.csv", header = TRUE)
head(data5)

ดูการกระจายของข้อมูลด้วยฟังก์ชั่น plot() และเมื่อทดสอบสหสัมพันธ์พบว่าทั้งสองตัวแปรมีความสัมพันธ์กันในระดับหนึ่ง

#scatter plot
plot(sonAttractiveness ~ fatherOrnamentation, data = data5)

#Correlations
cor(data5$sonAttractiveness, data5$fatherOrnamentation)

เมื่อนำข้อมูลของทั้งสองตัวแปรมาสร้างโมเดลเชิงเส้นด้วยฟังก์ชั่น lm() จะได้ความสัมพันธ์เป็น sonAttractiveness = 0.98(fatherOrnamentation) + 0.01

#Build linear model y= ax + b
linearModGuppy <- lm(sonAttractiveness ~ fatherOrnamentation, data=data5)  
print(linearModGuppy)

Call:
lm(formula = sonAttractiveness ~ fatherOrnamentation, data = data5)

Coefficients:
        (Intercept)  fatherOrnamentation  
           0.005084             0.982285

ข้อมูลชุดที่ 6 ระดับของฮีโมโกลบินกับความสูงจากระดับน้ำทะเล โดยประกอบด้วยข้อมูลจากพื้นที่สูงจากระดับน้ำทะเล 3 ตำแหน่งเทียบกับพื้นที่ที่ระดับน้ำทะเลปกติ

data6 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\HumanHemoglobinElevation.csv", header = TRUE)
head(data6)

#number of samples
table(data6$population)

   Andes Ethiopia    Tibet      USA 
      71      128       59     1704

stripchart(hemoglobin ~ population, data = data6, method = "jitter",
    vertical = TRUE)

จาก strip chart พบว่าระดับของฮีโมโกลบินของตัวอย่างจาก Andes มีระดับสูงกว่าพื้นที่อื่นๆ และจะเห็นความแตกต่างนี้ชัดเจนมากขึ้นเมื่อแดงผลข้อมูลด้วยฟังก์ชั่น boxplot() เพื่อแสดงผลระดับฮีโมโกลบินตามกลุ่มตัวอย่างประชากร

#boxplot
boxplot(hemoglobin ~ population, data = data6)

ตกแต่งกราฟด้วยการปรับพารามิเตอร์ของฟังก์ชั่น boxplot() ได้แก่ สี (col) ขนาดของกล่องข้อมูล (boxwex) สีและขนาดของข้อมูลที่เป็น outlier (outcol, outcex, outlty) และชื่อแกน (xlab และ ylab)

par(bty = "l")
boxplot(hemoglobin ~ population, data = data6,
	col = "goldenrod1", boxwex = 0.5, whisklty = 1, outcol = "black", 
	outcex = 1, outlty = "blank", las = 1, 
	xlab="Male population", ylab = "Hemoglobin concentration (g/dL)")

หากต้องการทดสอบสมมติฐานว่าระดับของฮีโมโกลบินของประชากรจาก Andes แตกต่างจาก USA หรือไม่ ทำได้โดยเลือกข้อมูลส่วนที่สนใจแล้วทำการทดสอบ t-test ผลการวิเคราะห์พบว่าระดับฮีโมโกลบินของสองพื้นที่นี้มีความแตกต่างกันอย่างมีนัยสำคัญ

#Select only data of Andes and USA
Andes.hem = data6$hemoglobin[data6$population == "Andes"]
#Andes.hem
USA.hem = data6$hemoglobin[data6$population == "USA"]
#USA.hem

#indepent 2-group t-test
t.test(Andes.hem, USA.hem)

	Welch Two Sample t-test

data:  Andes.hem and USA.hem
t = 17.723, df = 71.666, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 3.417099 4.283312
sample estimates:
mean of x mean of y 
 19.23099  15.38078

ข้อมูลชุดที่ 7 การระบาดของโรคหัดในประเทศอังกฤษและเวลส์ระหว่างปี ค.ศ. 1995-2011 ข้อมูลแบ่งเป็นรายสามเดือนในแต่ละปี

data7 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\MeaslesOutbreaks.csv", header = TRUE)
head(data7)

เมื่อแสดงข้อมูลจำนวนเคสการระบาดของโรคในช่วงเวลาที่สนใจด้วยกราฟเส้น (line plot) ในฟังก์ชั่น plot() จะเห็นว่าโรคนี้มีแนวโน้มการระบาดเพิ่มขึ้นมากจากอดีต

#time series plot
plot(confirmedCases ~ yearByQuarter, data = data7, type="l")

ข้อมูลชุดที่ 8 เป็นข้อมูลเปรียบเทียบความเร็วการวิ่งของแมงมุมก่อนและหลังตัดรยางค์

data8 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\SpiderAmputation.csv", header = TRUE)
head(data8)
table(data8$treatment)

 after before 
    16     16

เมื่อแบ่งข้อมูล treatment เป็นสองกลุ่ม แล้วแสดงข้อมูลด้วยฟังก์ชั่น boxplot() พบว่าความเร็วของแมงมุมหลังตัดรยางค์จะมากกว่าก่อนตัดรยางค์

data8$treatment <- factor(data8$treatment, levels = c("before", "after"))
boxplot(speed ~ treatment, data = data8)

แบ่งข้อมูลนี้เป็นก่อนและหลังตัดรยางค์ เก็บไว้ในตัวแปร sppedBefore และ speedAfter

speedBefore <- subset(data8, treatment == "before") 
speedBefore
speedAfter <- subset(data8, treatment == "after") 
speedAfter

คำนวนค่ากลางและควอนไทล์ของความเร็วก่อนและหลังด้วยฟังก์ชั่น median() และ quantile()

median(speedBefore$speed)
median(speedAfter$speed)

quantile(speedBefore$speed, probs = c(0.25, 0.75), type = 5)
quantile(speedAfter$speed, probs = c(0.25, 0.75), type = 5)

เนื่องจากข้อมูลนี้เป็นการเปรียบเทียบความเร็วของแมงมุมก่อนและหลังการตัดรยางค์ หากต้องการทดสอบทางสถิติว่าการทดลองก่อนและหลังนี้มีความเร็วแตกต่างกันหรือไม่ สามารถทำได้ด้วยการทดสอบ paired t-test ผลการทดสอบพบว่าความเร็วก่อนและหลังนั้นแตกต่างกันอย่างมีนัยสำคัญ

t.test(speedBefore$speed, speedAfter$speed, paired = TRUE)

	Paired t-test

data:  speedBefore$speed and speedAfter$speed
t = -4.4166, df = 15, p-value = 5e-04
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.757801 -0.613449
sample estimates:
mean of the differences 
              -1.185625

ข้อมูลชุดที่ 9 เป็นข้อมูลลักษณะ lateral plates ของปลา stickleback ที่มีจีโนไทป์ MM Mm และ mm

data9 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\SticklebackPlates.csv", header = TRUE)
head(data9)
table(data9$genotype)

 mm  Mm  MM 
 88 174  82

ทำการแปลงข้อมูล genotype ให้เป็นข้อมูลประเภท factor ด้วยฟังก์ชั่น factor() แล้วแสดง histogram ด้วยฟังก์ชั่น histogram() ในแพคเกจ lattice ที่เรียกใช้งานโดยฟังก์ชั่น library() จากกราฟจะเห็นว่าสัญลักษณ์ | genotype ใช้สำหรับแยกการแสดงข้อมูลตามกลุ่มของ genotype จาก histogram จะเห็นว่า lateral plate ของปลาทั้งสามกลุ่มจีโนไทป์มีความแตกต่างกัน

data9$genotype <- factor(data9$genotype, levels = c("MM","Mm","mm"))
library(lattice)

histogram(plates ~ genotype, data = data9, col = "firebrick")

#separate the plot by genotypes
histogram(~ plates | genotype, data = data9, breaks = seq(0,70,by=2), 
	  layout = c(1, 3), col = "firebrick")

ในกรณีตัวแปรหน่งมีข้อมูลหลายระดับหรือ levels สามารถใช้ฟังก์ชั่น tapply() เพื่อดำเนินการฟังก์ชั่นที่ต้องการ (FUN) ได้แก่ mean() length() sd() median() กับข้อมูลประเภท factor ซึ่งมีหลาย levels (INDEX) จากการวิเคราะห์พบว่า genotype Mm จะมีค่าความยาวเฉลี่ยของ lateral plate อยู่ระหว่าง genotypes MM และ mm

n <- tapply(data9$plates, INDEX = data9$genotype, FUN = length)
n

meanPlates <- tapply(data9$plates, INDEX = data9$genotype, FUN = mean)
meanPlates <- round(meanPlates, digits = 1)
meanPlates

medianPlates <- tapply(data9$plates, INDEX = data9$genotype, FUN = median)
medianPlates

sdPlates <- tapply(data9$plates, INDEX = data9$genotype, FUN = sd)
sdPlates <- round(sdPlates, 1) 
sdPlates

สรุปข้อมูลจากการวิเคราะห์สถิติเชิงบรรยายเป็นตารางด้วยฟังก์ชั่น data.frame()

sticklebackTable <- data.frame(genotype = names(n), n = n, 
	mean = meanPlates, median = medianPlates, 
	sd = sdPlates)
sticklebackTable

sticklebackFreq <- table(data9$genotype, dnn = "genotype") sticklebackFreq

sticklebackFreq <- data.frame(sticklebackFreq)
sticklebackFreq

คำนวนสัดส่วนความถี่ของ genotype แต่ละแบบ

sticklebackFreq$proportion <- sticklebackFreq$Freq / sum(sticklebackFreq$Freq)
sticklebackFreq

หากต้องการทดสอบว่าสัดส่วนของ genotype จากข้อมูลชุดนี้สอดคล้องกับค่าสัดส่วนทางทฤษฎีหากการถ่ายทอดทางพันธุกรรมของยีนนี้ควบคุมตามกฏของเมนเดลหรือไม่ สามารถทำได้โดยการทดสอบไคน์ สแคร์ (goodness of fit test) ด้วยฟังก์ชั่น chisq.test() ว่าสัดส่วนของ MM: Mm: mm เท่ากับ 1:2:1 ซึ่งผลการทดสอบพบว่าค่า p value มากกว่า 0.05 จงสามารถสรุปได้ว่าค่าสัดส่วนที่ได้นั้นไม่แตกต่างจากค่าทางทฤษฎี

experimentalCount = c(82, 174, 88)
res <- chisq.test(experimentalCount, p = c(1/4, 1/2, 1/4))
res

	Chi-squared test for given probabilities

data:  experimentalCount
X-squared = 0.25581, df = 2, p-value = 0.8799

ข้อมูลชุดที่ 10 เป็นข้อมูลอัตราความถี่ในการเลื้อย (undulationRateHz) ของงูจำนวน 8 ตัว แสดงการกระจายของข้อมูลด้วย histogram โดยใช้ฟังก์ชั่น hist()

data10 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\GlidingSnakes.csv", header = TRUE)
head(data10)

hist(data10$undulationRateHz, right = FALSE)

ข้อมูลนี้มีเพียงตัวแปรเดียว สามารถคำนวนค่าสถิติเชิงบรรยายเบื้องต้นได้ พบว่าอัตราความถี่ในการเลื้อยของงูกลุ่มนี้เฉลี่ยอยู่ที่ 1.375 มีค่าส่วนเบี่ยงเบนมาตรฐานและความแปรปรวนเท่ากับ 0.324 และ 0.105 ใช้ฟังก์ชั่น round() ในการกำหนดตำแหน่งทศนิยม

round(mean(data10$undulationRate),2)
round(sd(data10$undulationRate),2)
round(var(data10$undulationRate),2)

ข้อมูลชุดที่ 11 เป็นข้อมูลความยาวของยีนในมนุษย์ ฟังก์ชั่น nrow() แสดงจำนวนแถวในไฟล์ของข้อมูลนี้ เป็นข้อมูลเชิงปริมาณที่มีเพียงตัวแปรเดียว

data11 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\HumanGeneLengths.csv", header = TRUE)
head(data11)
nrow(data11)

ในการแสดงผลการวิเคราะห์สามารถเพิ่มข้อความหรือคำอธิบายเพื่อให้เข้าใจผลการวิเคราะห์ง่ายขึ้น โดยใช้ฟังก์ชั่น paste()

paste("Mean of the gene length = ", round(mean(data11$geneLength),2), ".", sep = "")
paste("SD of the gene length = ", round(sd(data11$geneLength),2), ".", sep = "")
paste("Maximum length of genes = ", max(data11$geneLength), ".", sep = "")
paste("Minimum length of genes = ", min(data11$geneLength), ".", sep = "")

ข้อมูลชุดที่ 12 เป็นข้อมูลของยีนที่พบได้บนโครโมโซม X สามารถสรุปข้อมูลความถี่เบื้องต้นได้ด้วยฟังก์ชั่น table()

data12 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\XGeneContent.csv", header = TRUE)
tail(data12)
nrow(data12)
table(data12$chromosome)

 NotX     X 
19509   781

เมื่อจัดข้อมูลเป็นตารางแสดงความถี่ หากมีสมมติฐานว่ายีนกระจายสม่ำเสมอด้วยจำนวนพอๆ กันในแต่ละโครโมโซมเช่น 882 ยีนต่อโครโมโซม แล้วจำนวนยีนที่พบบนโครโมโซม X สอดคล้องตามนี้หรือไม่ สามารถใช้การทดสอบไคน์ สแควร์เพื่อทดสอบสมมติฐานนี้ได้ ผลการทดสอบแสดงให้เห็นว่ายีนบนโครโมโซม X มีจำนวนไม่เท่ากับยีนบนโครโมโซมอื่นๆ

data12$chromosome <- factor(data12$chromosome, levels = c("X","NotX"))
geneContentTable <- table(data12$chromosome)
data.frame(Frequency = addmargins(geneContentTable))

#Chi-square goodness of fit test
chisq.test( geneContentTable, p = c(882, 19408)/20290 )

	Chi-squared test for given probabilities

data:  geneContentTable
X-squared = 12.091, df = 1, p-value = 0.0005066

ข้อมูลชุดที่ 13 เป็นข้อมูลการกระจายของนกทะเลทรายในสถานที่แห่งหนึ่ง

data13 = read.csv("C:\\Users\\TeerasakArt\\Downloads\\DesertBirdAbundance.csv", header = TRUE)
head(data13)
summary(data13)

                    species     abundance     
 American Kestrel       : 1   Min.   :  1.00  
 Ash-throated Flycatcher: 1   1st Qu.:  3.50  
 Bell's Vireo           : 1   Median : 18.00  
 Black-chin. Hummingbird: 1   Mean   : 74.77  
 Black-tail. Gnatcatcher: 1   3rd Qu.:102.50  
 Black-throated Sparrow : 1   Max.   :625.00  
 (Other)                :37

ทำการจัดข้อมูลนี้ใหม่โดยแบ่งช่วงความถี่เป็นช่วงละ 50 แล้วนับว่ามีนกจำนวนกี่ชนิดที่มีจำนวนอยู่ในช่วงที่แบ่งไว้ แล้วแสดงข้อมูลด้วย histogram จากกราฟจะเห็นว่ามีนกหนึ่งชนิดที่มีจำนวนเยอะมากกว่าชนิดอื่นๆ

birdAbundanceTable <- table(cut(data13$abundance, breaks = seq(0,650,by=50), right = FALSE))
birdAbundanceTable

data.frame(Frequency = addmargins(birdAbundanceTable))

   [0,50)  [50,100) [100,150) [150,200) [200,250) [250,300) [300,350) [350,400) 
       28         4         3         3         1         2         1         0 
[400,450) [450,500) [500,550) [550,600) [600,650) 
        0         0         0         0         1

hist(data13$abundance, right = FALSE)

fatherOrnamentation	sonAttractiveness
0.35	-0.32
0.03	-0.03
0.14	0.11
0.10	0.28
0.22	0.31
0.23	0.18

id	hemoglobin	population
US.Sea.level1	10.40	USA
US.Sea.level2	11.20	USA
US.Sea.level3	11.70	USA
US.Sea.level4	11.80	USA
US.Sea.level5	11.90	USA
US.Sea.level6	12.05	USA

year	quarter	confirmedCases	yearByQuarter
2011	4th	136	2011.88
2011	3rd	154	2011.62
2011	2nd	346	2011.38
2011	1st	151	2011.12
2010	4th	31	2010.88
2010	3rd	134	2010.62

person	activity
1	Disturbing tiger kill
2	Forest products
3	Grass/fodder
4	Fuelwood/timber
5	Grass/fodder
6	Forest products

Frequency.Var1	Frequency.Freq
Grass/fodder	44
Forest products	11
Fishing	8
Herding	7
Disturbing tiger kill	5
Fuelwood/timber	5
Sleeping in house	3
Walking	3
Toilet	2
Sum	88

year	sex	oceanAgeYears	lengthMm	massKg
1996	FALSE	3	513	3.090
1996	FALSE	3	513	2.909
1996	FALSE	3	525	3.056
1996	FALSE	3	501	2.690
1996	FALSE	3	513	2.876
1996	FALSE	3	501	2.978

	year	sex	oceanAgeYears	lengthMm	massKg
223	1996	FALSE	2	447	1.930
224	1996	FALSE	2	427	1.715
225	1996	FALSE	2	427	1.587
226	1996	FALSE	2	447	1.825
227	1996	FALSE	2	447	1.859
228	1996	FALSE	2	427	1.581

	year	sex	oceanAgeYears	lengthMm	massKg
7	1996	FALSE	2	427	1.610
8	1996	FALSE	2	457	2.156
9	1996	FALSE	2	427	1.563
10	1996	FALSE	2	447	1.763
11	1996	FALSE	2	437	1.790
13	1996	FALSE	2	457	1.906

bird	treatment	response
1	Control	Malaria
2	Control	Malaria
3	Control	Malaria
4	Control	Malaria
5	Control	Malaria
6	Control	Malaria

spider	speed	treatment
1	1.25	before
2	2.94	before
3	2.38	before
4	3.09	before
5	3.41	before
6	3.00	before

	spider	speed	treatment
17	1	2.40	after
18	2	3.50	after
19	3	4.49	after
20	4	3.17	after
21	5	5.26	after
22	6	3.22	after
23	7	2.32	after
24	8	3.31	after
25	9	3.70	after
26	10	4.70	after
27	11	4.94	after
28	12	5.06	after
29	13	3.22	after
30	14	3.52	after
31	15	5.45	after
32	16	3.40	after

species	abundance
Black Vulture	64
Turkey Vulture	23
Harris's Hawk	3
Red-tailed Hawk	16
American Kestrel	7
Gambel's Quail	148

Frequency.Var1	Frequency.Freq
[0,50)	28
[50,100)	4
[100,150)	3
[150,200)	3
[200,250)	1
[250,300)	2
[300,350)	1
[350,400)	0
[400,450)	0
[450,500)	0
[500,550)	0
[550,600)	0
[600,650)	1
Sum	43

serotoninLevel	treatmentTime
5.3	0
4.6	0
4.5	0
4.3	0
4.2	0
3.6	0

id	plates	genotype
4-1	11	mm
4-2	63	Mm
4-4	22	Mm
4-5	10	Mm
4-10	14	mm
4-12	11	mm

	genotype	n	mean	median	sd
MM	MM	82	62.8	63	3.4
Mm	Mm	174	50.4	59	15.1
mm	mm	88	11.7	11	3.6

genotype	Freq	proportion
MM	82	0.2383721
Mm	174	0.5058140
mm	88	0.2558140