Homework_06.Rmd

Run the sample code

# Open Libraries
# library(ggplot2) # for graphics
# library(MASS) # for maximum likelihood estimation

# Read in data vector: quick and dirty, a truncated normal distribution to work on the solution set
# z <- rnorm(n=3000,mean=0.2)
# z <- data.frame(1:3000,z)
# names(z) <- list("ID","myVar")
# z <- z[z$myVar>0,]
# str(z)
# 
# # Plot histogram of data
# p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
#   geom_histogram(color="grey60",fill="cornsilk",size=0.2)
# print(p1)
# 
# # Add empirical density curve 
# p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
# print(p1)

# # Get maximum likelihood parameters for "normal" 
# normPars <- fitdistr(z$myVar,"normal")
# print(normPars)
# str(normPars)
# normPars$estimate["mean"] # note structure of getting a named attribute
# 
# # Plot "normal" probability density 
# meanML <- normPars$estimate["mean"]
# sdML <- normPars$estimate["sd"]
# 
# xval <- seq(0,max(z$myVar),len=length(z$myVar))
# 
# #  stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
#  p1 + stat

# # Plot "exponential" probability density
# expoPars <- fitdistr(z$myVar,"exponential")
# rateML <- expoPars$estimate["rate"]
# 
# stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
#  p1 + stat + stat2
#  
# # Plot "uniform" probability density 
#  stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
#  p1 + stat + stat2 + stat3
 
# # Plot "gamma" probability density 
#  gammaPars <- fitdistr(z$myVar,"gamma")
# shapeML <- gammaPars$estimate["shape"]
# rateML <- gammaPars$estimate["rate"]

# stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
#  p1 + stat + stat2 + stat3 + stat4
 
# # Plot "beta" probability density 
# pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
#   geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
#   xlim(c(0,1)) +
#   geom_density(size=0.75,linetype="dotted")

# betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
# shape1ML <- betaPars$estimate["shape1"]
# shape2ML <- betaPars$estimate["shape2"]

# statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
# pSpecial + statSpecial

Trying it with my own data

z <- read.table("MyDataFile.csv",header=TRUE,sep=",")
str(z)

## 'data.frame':    40 obs. of  2 variables:
##  $ SAMPLE      : chr  "CCS2011-023" "CCS2011-041" "CCS2009-033" "CCS2010-086" ...
##  $ AGE_IN_YEARS: num  0.4 0.4 1.3 1.5 2.2 2.5 3.5 3.6 4.4 4.6 ...

summary(z)

##     SAMPLE           AGE_IN_YEARS   
##  Length:40          Min.   : 0.400  
##  Class :character   1st Qu.: 5.275  
##  Mode  :character   Median :10.350  
##                     Mean   :12.375  
##                     3rd Qu.:20.300  
##                     Max.   :30.300

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
p1 <- ggplot(data=z, aes(x=AGE_IN_YEARS, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1) # histogram of the age of humpback whales in a study. A histogram can only have one quantitative value.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1) # an empirical density curve to smooth out the data

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

normPars <- fitdistr(z$AGE_IN_YEARS,"normal") # z$AGE_IN_YEARS
print(normPars)

##       mean          sd    
##   12.3750000    8.8179008 
##  ( 1.3942325) ( 0.9858713)

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 12.38 8.82
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 1.394 0.986
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 1.944 0 0 0.972
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 40
##  $ loglik  : num -144
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"] # the mean age of humpback whales was 12.375 years

##   mean 
## 12.375

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$AGE_IN_YEARS),len=length(z$AGE_IN_YEARS))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$AGE_IN_YEARS), args = list(mean = meanML, sd = sdML))
 p1 + stat # Normal probability density

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 expoPars <- fitdistr(z$AGE_IN_YEARS,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$AGE_IN_YEARS), args = list(rate=rateML))
 p1 + stat + stat2 # exponential curve

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$AGE_IN_YEARS), args = list(min=min(z$AGE_IN_YEARS), max=max(z$AGE_IN_YEARS)))
 p1 + stat + stat2 + stat3 # uniform distribution

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 gammaPars <- fitdistr(z$AGE_IN_YEARS,"gamma")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$AGE_IN_YEARS), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4 # Gamma probability density. Looks most similar to data

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 pSpecial <- ggplot(data=z, aes(x=AGE_IN_YEARS/(max(AGE_IN_YEARS + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$AGE_IN_YEARS/max(z$AGE_IN_YEARS + 0.1),start=list(shape1=1,shape2=2),"beta")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$AGE_IN_YEARS), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial # This is a very weird curve.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (geom_bar).

Running My Data as Gamma

z <- rgamma(40,1.396564, rate = 0.1128381)
z <- data.frame(1:3000,z)
names(z) <- list("SAMPLE", "AGE_IN_YEARS")
z <- z[z$AGE_IN_YEARS>0,]
str(z)

## 'data.frame':    3000 obs. of  2 variables:
##  $ SAMPLE      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ AGE_IN_YEARS: num  12.84 21.43 19.95 27.64 2.94 ...

summary(z$AGE_IN_YEARS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.138   6.410  12.428  15.866  22.776  46.897

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
p1 <- ggplot(data=z, aes(x=AGE_IN_YEARS, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1) # histogram of the age of humpback whales in a study. A histogram can only have one quantitative value.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1) # an empirical density curve to smooth out the data

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

w <- read.table("MyDataFile.csv",header=TRUE,sep=",")
str(w)

## 'data.frame':    40 obs. of  2 variables:
##  $ SAMPLE      : chr  "CCS2011-023" "CCS2011-041" "CCS2009-033" "CCS2010-086" ...
##  $ AGE_IN_YEARS: num  0.4 0.4 1.3 1.5 2.2 2.5 3.5 3.6 4.4 4.6 ...

summary(w)

##     SAMPLE           AGE_IN_YEARS   
##  Length:40          Min.   : 0.400  
##  Class :character   1st Qu.: 5.275  
##  Mode  :character   Median :10.350  
##                     Mean   :12.375  
##                     3rd Qu.:20.300  
##                     Max.   :30.300

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
p1 <- ggplot(data=w, aes(x=AGE_IN_YEARS, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1) # histogram of the age of humpback whales in a study. A histogram can only have one quantitative value.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1) # an empirical density curve to smooth out the data

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# The histograms are quite different. I was deciding between using uniform or gamma distribution for my simulation and decided to do gamma. The original graph looks relatively uniform, but does have a trend from high to low going left to right. There were two very small humps in the probability curve for the raw data histogram, but overall there was a single curve trending downward.  However, the gamma histogram had two large humps at the left of the histogram and three smaller ones to the right of the histogram according to the probability curve. The data were concentrated at the left of the histogram, with very few values to the right. Overall, the graph of the simulated data using the gamma distribution did not fit the data well and looked very different from the original histogram of the data.

Homework_06.Rmd

Grace Durant

2/23/2022

Run the sample code

Trying it with my own data

Running My Data as Gamma