Question 1 Directions: Using a for loop, write a function to calculate the number of zeroes in a numeric vector. Before entering the loop, set up a counter variable counter <- 0. Inside the loop, add 1 to counter each time you have a zero in the matrix. Finally, use return(counter) for the output.
counter <- 0
num_vector<-c(0,0,0,0,3,4,5,3,6,7,5,3,2,4,2,1,4,6,9,0,3,4,7,8,0,2,34,56,6,9,0,2,3,56,7,8,9,0,2,2,4,4,6,7,9,9,0,0,8,7,5)
for (i in 1:length(num_vector)) {
if(num_vector[i]==0) {
counter <- counter+1
}
}
print(counter)
## [1] 10
Question 4a-d Directions:
- Simulate a dataset with 3 groups of data, each group drawn from a distribution with a different mean. The final data frame should have 1 column for group and 1 column for the response variable.
- Write a custom function that 1) reshuffles the response variable, and 2) calculates the mean of each group in the reshuffled data. Store the means in a vector of length 3.
- Use a for loop to repeat the function in b 100 times. Store the results in a data frame that has 1 column indicating the replicate number and 1 column for each new group mean, for a total of 4 columns.
- Use qplot() to create a histogram of the means for each reshuffled group. Or, if you want a challenge, use ggplot() to overlay all 3 histograms in the same figure. How do the distributions of reshuffled means compare to the original means?
# Part a, simulating a data set
three_groups <- c(rep("group_1",4),rep("group_2",5), rep("group_3",3))
print(three_groups)
## [1] "group_1" "group_1" "group_1" "group_1" "group_2" "group_2" "group_2"
## [8] "group_2" "group_2" "group_3" "group_3" "group_3"
response_variable <- c(runif(4)+3, runif(5)+7, runif(3)-0.5)
print(response_variable)
## [1] 3.21041973 3.77519260 3.83654354 3.62866688 7.59105523 7.57911946
## [7] 7.46562993 7.70913067 7.43242684 0.40827959 -0.06492981 0.07491984
df<- data.frame(group=three_groups,res=response_variable)
print(df)
## group res
## 1 group_1 3.21041973
## 2 group_1 3.77519260
## 3 group_1 3.83654354
## 4 group_1 3.62866688
## 5 group_2 7.59105523
## 6 group_2 7.57911946
## 7 group_2 7.46562993
## 8 group_2 7.70913067
## 9 group_2 7.43242684
## 10 group_3 0.40827959
## 11 group_3 -0.06492981
## 12 group_3 0.07491984
# part b, custom function
df_sim <- df
df_sim$res <- sample(df_sim$res)
print(df_sim)
## group res
## 1 group_1 3.77519260
## 2 group_1 7.43242684
## 3 group_1 0.40827959
## 4 group_1 3.21041973
## 5 group_2 7.59105523
## 6 group_2 3.83654354
## 7 group_2 -0.06492981
## 8 group_2 3.62866688
## 9 group_2 0.07491984
## 10 group_3 7.70913067
## 11 group_3 7.57911946
## 12 group_3 7.46562993
sim <- tapply(df_sim$res,df$group,mean)
print(sim)
## group_1 group_2 group_3
## 3.706580 3.013251 7.584627
sim_vector <- as.vector(sim)
print(sim_vector)
## [1] 3.706580 3.013251 7.584627
# Part c, for loop
##################################################
# function: shuffle_data
# Purpose: use for loop to repeat function 100 times
# input: means of shuffled response variables in vector form
# output: 4-column data frame (replicate #, group mean 1, group mean2, group mean 3)
#-------------------------------------------------
shuffle_data <- function(data=NULL) {
data$res <- sample(data$res)
return(data)
}
shuffle_data(df)
## group res
## 1 group_1 7.57911946
## 2 group_1 7.43242684
## 3 group_1 3.77519260
## 4 group_1 3.62866688
## 5 group_2 7.59105523
## 6 group_2 7.46562993
## 7 group_2 0.40827959
## 8 group_2 3.83654354
## 9 group_2 7.70913067
## 10 group_3 0.07491984
## 11 group_3 3.21041973
## 12 group_3 -0.06492981
get_metric <- function(data=NULL) {
. <- summary(aov(data=data,res~group))
p_val <- .[[1]][["Pr(>F)"]][1]
return(p_val)
}
get_pval <- function(z=NULL) {
if(is.null(z)){
z <- list(x_obs=runif(1),x_sim=runif(1000))}
p_lower <- mean(z[[2]]<=z[[1]])
p_upper <- mean(z[[2]]>=z[[1]])
return(c(pl=p_lower,pu=p_upper))
}
n_sim <- 100
x_sim <- rep(NA,n_sim) # vector of simulated slopes
df <- df
x_obs <- get_metric(df)
for (i in seq_len(n_sim)) {
x_sim[i] <- get_metric(shuffle_data(df))
}
head(x_sim)
## [1] 0.8454053 0.5278592 0.5680170 0.2110502 0.1069622 0.6178352
p_value <- list(x_obs,x_sim)
get_pval(p_value)
## pl pu
## 0 1
library(ggplot2)
plot_ran_test <- function(z=NULL) {
if(is.null(z)){
z <- list(rnorm(1),rnorm(1000)) }
df <- data.frame(ID=seq_along(z[[2]]),sim_x=z[[2]])
p1 <- ggplot(data=df) +
aes(x=sim_x)
p1 + geom_histogram(aes(fill=I("goldenrod"),
color=I("black"))) +
geom_vline(aes(xintercept=z[[1]],
col="blue"))
}
plot_ran_test(p_value)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.