### Analysis of the babyboom dataset ###

# NOTE: the former `rm(list = ls())` was removed. Wiping the caller's
# workspace is a surprising side effect for a script; restart the R session
# instead when a clean state is wanted.

## Load and prepare the data ----

# Read the raw table from the working directory and give the four columns
# descriptive names.
babyboom <- read.table("babyboom.dat")
head(babyboom)
names(babyboom)
names(babyboom) <- c("timebirth", "gender", "weight", "min.after.mid")
head(babyboom)

# Inspect gender before and after converting it to a labelled factor.
# The column is spelled out in full: `babyboom$g` only worked through `$`
# partial matching, which breaks silently as soon as another column
# starting with "g" appears.
babyboom$gender
is.factor(babyboom$gender)
# Labels are assigned to the sorted distinct codes, so the smaller code
# becomes "girl" -- presumably 1 = girl, 2 = boy; confirm against the
# data description.
babyboom$gender <- factor(babyboom$gender, labels = c("girl", "boy"))
babyboom$gender
mode(babyboom)
class(babyboom)

## Frequency tables for birth weight with automatic class counts ----

x <- babyboom$weight

# cut() bins the continuous variable; the outer factor() drops empty bins.
factorx <- factor(cut(x, breaks = nclass.Sturges(x)))
table(factorx)
factorx
as.matrix(table(factorx))  # closing parenthesis was missing here

# Same binning rules, with dig.lab = 5 so the interval labels show the
# break points with up to five significant digits.
factorx <- factor(cut(x, breaks = nclass.Sturges(x), dig.lab = 5))
as.matrix(table(factorx))

factorx <- factor(cut(x, breaks = nclass.scott(x), dig.lab = 5))
as.matrix(table(factorx))

factorx <- factor(cut(x, breaks = nclass.FD(x), dig.lab = 5))
as.matrix(table(factorx))

## Fixed number of equal-width classes ----

nclasses <- 10
step <- (max(x) - min(x)) / nclasses

# Build the break points once and reuse them everywhere. Using length.out
# guarantees that the last break is exactly max(x), so the largest
# observation can never fall outside the bins through rounding of
# repeated `by = step` additions.
brks <- seq(min(x), max(x), length.out = nclasses + 1)

# Left-closed classes [a, b); include.lowest = TRUE closes the last class
# on the right so that max(x) is counted.
factorx <- factor(
  cut(x, breaks = brks, include.lowest = TRUE, dig.lab = 5, right = FALSE)
)
as.matrix(table(factorx))

## Histograms ----

# NOTE: hist() uses right-closed classes (a, b] by default, so observations
# lying exactly on a break point may be counted in a different class than
# in the left-closed table above.
hist(x, breaks = brks)
hist(x, breaks = brks, freq = FALSE)
hist(
  x,
  breaks = brks,
  col = "lightblue",
  border = "blue",
  main = "Histogram of weight",
  xlab = "Weight (gm)",
  ylab = "Counts"
)

# Density-scaled histogram with a kernel density estimate (solid) and the
# normal density matching the sample mean and sd (dashed, red) on top.
hist(x, breaks = brks, freq = FALSE)
lines(density(x))
z <- seq(min(c(0, x)), max(x), length.out = 100)
lines(z, dnorm(z, mean(x), sd(x)), lty = 2, col = 2)

# Histogram and kernel density stacked in one figure. The previous layout
# is restored afterwards so that later plots are not squeezed into the
# two-row layout.
old_par <- par(mfrow = c(2, 1))
hist(x, breaks = brks, freq = FALSE)
plot(density(x))
par(old_par)

##############################################
# Tabulate and turn into data.frame
# Absolute, cumulative, relative and cumulative relative frequencies of the
# ten equal-width classes; the row names are the class labels.
Freq <- table(factorx)
rel.Freq <- prop.table(Freq)
xout <- data.frame(
  Freq = as.numeric(Freq),
  cum.Freq = cumsum(Freq),
  rel.Freq = as.numeric(rel.Freq),
  cum.rel.Freq = cumsum(rel.Freq)
)
round(xout, 3)

# Tabulating the raw weights gives one cell per distinct value, which is
# not a useful summary of a continuous variable.
table(babyboom$weight)
# A continuous variable has to be binned into a factor (e.g. with cut())
# before a frequency table of it is meaningful.

# Packages are loaded once, up front, in the same order in which the script
# originally attached them.
# library(sjPlot)
library(sjmisc)
# install.packages("psych")
library(psych)

## Frequency tables via sjmisc ----

# `factorx` is the binned birth weight prepared earlier in the script.
frq(factorx, title = "Birth Weight", out = "v")
# Let frq() group the raw weights itself (auto.grp = 10).
frq(babyboom$weight, title = "Birth Weight", out = "v", auto.grp = 10)

## Descriptive statistics ----

x <- babyboom$weight
mean(x)
median(x)
var(x)
sd(x)
mad(x)  # median absolute deviation
IQR(x)
range(x)
min(x)
max(x)

# Extreme tails plus every 5th percentile, then the deciles.
pp <- c(0.005, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.995)
quantile(x, probs = pp)
quantile(x, probs = seq(0.1, 0.90, 0.1))

# Descriptives in R using data frames

# Keep only the numeric/integer columns. drop = FALSE keeps `bq` a data
# frame even if a single column qualifies, so the sapply() calls below
# still iterate over columns rather than over individual values.
q <- sapply(babyboom, class)
q
bq <- babyboom[, q %in% c("numeric", "integer"), drop = FALSE]
x <- bq
head(x)

# One summary statistic per numeric column.
sapply(x, mean)
sapply(x, median)
sapply(x, var)
sapply(x, sd)
sapply(x, mad)
sapply(x, IQR)
sapply(x, range)
sapply(x, min)
sapply(x, max)

pp <- seq(0.1, 0.90, 0.1)
sapply(x, quantile, probs = pp)

summary(babyboom)

# psych summaries: overall, then split by gender. describeBy() replaces
# describe.by(), which is the deprecated former name of the same function.
describe(babyboom)
describeBy(babyboom, group = babyboom$gender)

## Boxplots ----

# At this point `x` is the data frame of numeric columns: one box per column.
boxplot(x)
boxplot(x, horizontal = TRUE)
boxplot(weight ~ gender, data = babyboom)
boxplot(weight ~ gender, data = babyboom, col = c(2, 3))

# Weight and minutes after midnight by gender, side by side.
par(mfrow = c(1, 2))
boxplot(weight ~ gender, data = babyboom)
boxplot(min.after.mid ~ gender, data = babyboom)
dev.off()  # closes the device, which also discards the 1x2 layout

## Categorical variables

# Absolute and relative frequencies of gender, with the factor levels as
# row names.
x <- babyboom$gender
Freq <- table(x)
rel.Freq <- prop.table(Freq)
xout <- data.frame(
  Freq = as.numeric(Freq),
  rel.Freq = as.numeric(rel.Freq)
)
row.names(xout) <- names(Freq)
round(xout, 3)

frq(babyboom$gender, title = "Gender of newborn", out = "v")