STATS HOMEWORK USING R:
#Lab 6
#274-Wilcox (Fall 2019)
#Name:
#Student ID:
rm(list=ls())
source(‘Rallfun-v33.txt’)
#####
#Please name your variables according to the question, otherwise points will be deducted even if the command is correct
#####
#Submit any histogram/density plots
#####
#PART 1-Sampling t-distribution
#1.1) Use the scan( ) function to read in Lab6hw.txt into a variable called pop2. Create a histogram of the data AND describe the distribution.
#1.2) Use the function CI_var_known we wrote in class to get the 95% confidence interval.
#1.3) Use the function CI_var_unknown we wrote in class to get the 95% confidence interval.
#1.4) Draw 500 samples of size 25 from pop2 and put these into a matrix called tsams.
#1.5) Compute the T-scores for each sample and store these into a new vector called tscores.
#1.6) Using a histogram, plot the distribution of T-scores (tscores). Describe the distribution.
################################################
#PART 2-Simulated t-distribution
#2.1) Simulate a distribution of 500 t-scores with 24 degrees of freedom into a variable called tsim using the rt() function.
#2.2) Using a histogram, plot the distribution of simulated T-scores (tsim). Describe the distribution.
#2.3) Using an overlaid density plot, simultaneously show the distributions of the sampled t-scores (tscores) and simulated t-scores (tsim)
#2.4) Based on the plot in 2.3, how do the distributions compare
################################################
#PART 3-Implications for Confidence Intervals
#3.1)When computing confidence intervals, the critical quantile is always derived from the theoretical (simulated) t-distribution, which assumes normality.
# As you discovered in 2.4, the actual distribution of t-scores can be very different from the theoretical one, depending on the population distribution.
# If we ignore the fact that the population may be non-normal, and compute confidence intervals in the usual way,
# what will happen to the probability coverage (i.e. accuracy) of the confidence intervals?
—————————————————————————-
LECTURE NOTES FOR LAB 6
#Lab 6-Contents
#0. Quick review of What we’ve been doing
#1. Confidence Intervals when the Population Variance is Known
#2. Confidence Intervals when the Population Variance is Unknown
#3. Critical values and sample size
#4. Writing Functions to Calculate CI
#5. The T-Distribution
#———————————————————————————
# 0. Quick Review
#———————————————————————————
#By now, you have a bunch of tools in your toolbox to handle data
#Below is a listing of some of the functions we’ve used
#=======================#===============================#===========================================================================#
# function # Meaning # Example #
#=======================#===============================#===========================================================================#
# c(#,#,…)# Collection of Numbers# vect1=c(1,2,3,4)#
## Usually to assign to a vector # #
#———————–#——————————-#—————————————————————————#
# cbind()# Combine vectors by Columns # newC=cbind(vect1, vect2)#
# rbind() # Combine vectors by Rows# newR=rbind(vect1, vect2)#
#———————–#——————————-#—————————————————————————#
# dat[i,j]# Retrieving values from data # dat[3,25] #Row 3, Column 25#
## i=row, j=column# dat[ ,”bmi”] #All Rows, variable bmi#
#———————–#——————————-#—————————————————————————#
# fix()# Visualize data # fix(dat)
#———————–#——————————-#—————————————————————————#
# read.table()# Read in external text file # mydat=read.table(file.choose(), header=T)#
# scan() # Read data into a vector
#———————–#——————————-#—————————————————————————#
# source()# Read in Dr. Wilcox Source Code# source(file.choose())#
#———————–#——————————-#—————————————————————————#
# dim()# Dimensions of Data (rows,cols)# dim(mydata)#
# names()# Variable names in Data# names(mydata)#
# str()# Data properties# str(mydata)#
# head()# Show first 6 rows# head(mydata)#
# tail()# Show last 6 rows# tail(mydata)#
#———————–#——————————-#—————————————————————————#
# which()# Used to identify rows in a # rows=which(mydata$bmi < 15)#
## data file. Often for recoding # mydata[rows, “bmi”] = NA#
#———————–#——————————-#—————————————————————————#
# min()# Min value in variable# min(mydata$bmi, na.rm=T)#
# max()# Max value in variable# max(mydata$bmi, na.rm=T)#
# range()# Min and Max value in variable # range(mydata$bmi, na.rm=T)#
#———————–#——————————-#—————————————————————————#
# table()# Frequencies of variable# table(mydata$race)#
#———————–#——————————-#—————————————————————————#
# mean()# mean of variable# mean(mydata$bmi, na.rm=T)#
# median()# median of variable# median(mydata$bmi, na.rm=T)#
# tmean()*# Trimmed Mean of Variable# tmean(mydata$bmi, tr=0.2, na.rm=T)#
# sd()# Standard Deviation of Variable# sd(mydata$bmi, na.rm=T)#
# var()# Variance of variable# var(mydata$bmi, na.rm=T)#
#———————–#——————————-#—————————————————————————#
# boxplot()# Boxplot of Variable# boxplot(mydata$bmi)#
# hist()# Histogram of Variable# hist(mydata$bmi)#
# density()# Density plot# plot(density(mydata$bmi, na.rm=T))#
#———————–#——————————-#—————————————————————————#
# dbinom(k,n,p)# Prob of EXACTLY k Successes # dbinom(5,20,0.45) #
### Prob of EXACTLY 5 success out of 20 given prob of 0.45#
# pbinom(k,n,p)# Prob of <= k Success# pbinom(5,20,0.45) #
###Prob of <= 5 success out of 20 given prob of 0.45#
#———————–#——————————-#—————————————————————————#
# pnorm(val,mean,sd) # Prob of < val on normal dist # pnorm(50,65,5) #Prob of scoring <50 on a norm dist of mean 65 and sd 5 #
# qnorm(p,mean,sd)# Value where the Prob of val=p # qnorm(0.2,65,5) #The score where the prob for < score is 0.20#
#———————–#——————————-#—————————————————————————#
# rnorm(N,mean,sd)# Simulate normal distribution # pop=rnorm(5000,0,1) #
###a random variable pop with N=5000 and a mean of 0, sd of 1 #
# runif(N)# Simulate uniform distribution # popunif=runif(5000) #
###a random variable popunif with N=5000 distributed uniformly#
# cnorm(n, epsilon, k)* # Contaminated Normal dist# cpop=cnorm(5000, 0.4, k=3) #
### A standard normal (mean=0, sd=1) for 1-epsilon (60%) of the data #
### A normal of mean=0 and sd=k (eg. 3) for epsilon (40%) of the data #
#———————–#——————————-#—————————————————————————#
# sample(data, size) # Take a random sample of data # x=sample(pop, size=50, replace=T) #
### Take a sample from pop of size 50 and store in x#
#———————–#——————————-#—————————————————————————#
# matrix(,ncol=#,nrow=#)# Create a blank matrix# mysams=matrix(, ncol=200, nrow=50) #
###Create a blank matrix called mysams of 200 columns and 50 rows#
#———————–#——————————-#—————————————————————————#
# for (i in START:END) {# For Loop to loop over numbers # for (i in 1:200) {#
# }## mysams[,i]=sample(pop, size=50, replace=T)#
### }#
#———————–#——————————-#—————————————————————————#
# apply(data,2,FUN)# Apply a FUNction to columns # sammeans=apply(mysams, 2, FUN=mean) #
###Store column means of mysams in variable called sammeans#
#———————–#——————————-#—————————————————————————#
#NOTE: Functions with a star (*) after are from Dr. Wilcox’s source code and must have the source code read in using the source() command first
#———————————————————————————
# 1. Confidence Intervals when the Population Variance is known.
#———————————————————————————
#The formula for a CI is:
# LB: mean – c * (sd/sqrt(N))
# UB: mean + c * (sd/sqrt(N))
#And we can compute c based upon the quantiles
#from a standard normal distribution where
# c = qnorm(1-(alpha/2)).
# Remember for an 80% CI, alpha is 1-0.8 = 0.2;
# for a 65% CI, alpha = 1-0.65 = 0.35 etc.
#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
# EXERCISE 1-1: Compute a 95% confidence interval for mean=25,
# population variance=25, N=20
#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
mean1=25
var1=25
N=20
sd1=sqrt(var1)
alpha=1-.95
c= qnorm(1-(alpha/2))
LB= mean1 – c * (sd1/sqrt(N));LB
UB= mean1 + c * (sd1/sqrt(N));UB
#??????????????????????????????????????????????????????????????#
#Thought Question 1: For the above, why do we use alpha/2 to
# find the quantile. Or rather, why am I using qnorm(0.975) to
# find c rather than qnorm(0.95)
#??????????????????????????????????????????????????????????????#
#———————————————————————————
# 2. Confidence Intervals when the Population Variance is UNKNOWN.
#———————————————————————————
#In reality, rarely do we know the population variance.
#Instead what we have is our sample variance.
#While the formula for a CI is the same as we used before:
# LB: mean – c * (sd/sqrt(N))
# UB: mean + c * (sd/sqrt(N))
#However, the way we calculate our critical value ‘c’ will now
# be from the quantiles of a T-distribution rather than a normal.
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
# T Dist Probability (T <= t): pt(q, df)
# T Dist Quantile:qt(p, df)
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
# The function pt() computes the probability that
# T is <= some quantile q given degrees of freedom (df)
# The function qt() computes the quantile for T at the probability
# value of p given degrees of freedom (df)
# What are degrees of freedom?
#For now, just know that df=N-1
#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
#Exercise 2-1:
# A) Calculate the probabilty that T is <= 1 given 10 df
# B) Calculate the 0.975 quantile with 10 df
# C) What is the sample size for 10 df
#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
#A)
pt(1,10)
#B)
qt(0.975,10)
#C)
N=11
#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
#EXERCISE 2-2:
# A) Compute a 95% confidence interval for mean=25,
# SAMPLE variance=25, N=20
# B) How does the Lower and Upper Bounds compare to those
# found in Exercise 2-1 then the Population variance was known
#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
#A)
mean2=25
var2=25
N2=20
sd2=sqrt(var2)
alpha2=1-.95
df=N2-1
c2= qt(1-(alpha2/2),df)
LB2= mean2 – c2 * (sd2/sqrt(N2));LB2
UB2= mean2 + c2 * (sd2/sqrt(N2));UB2
#B)
#For the normal
# LB=22.80869
#UB=27.19131
#For the T-dis we got
#LB2=22.65993
#UB2=27.34007
#———————————————————————————
# 3. Critical Values and Sample Size
#———————————————————————————
# Let’s think more about, why the Confidence interval was wider
# when the sample variance was unknown?
# We’ll start by computing the critical values for a variety
# of sample sizes for a 95% CI
# When the population variance is known, we use the qnorm() function
# to find the critical value
qnorm(0.975)
# Which is 1.959964. Notice that it doesn’t matter what sample size I have, the critical value is the same.
# I’m going to use a loop to loop over values of
# N=20, 50, 100, 200, 1000, and 10000
# to see what the critical values would be at various sample sizes
# when the population variance is unknown
for (ii in c(25,50,100,200,1000,10000)) {
qtval=qt(0.975, ii-1)
print(qtval)
}
#??????????????????????????????????????????????????????????????#
#Thought Question 2: What do you notice about the relationship
# between the critical values and sample size?
#??????????????????????????????????????????????????????????????#
#———————————————————————————
# 4. Writing Functions to Calcuate the Confidence Intervals
#———————————————————————————
#To calculate the confidence interval easier, we can write our own function which figures out the CI for us
#I’ll start by writing the function for the confidence interval when the population variance is Unknown
CI_var_unknown = function(data, ci) {
#data=c(1,2,3,4,5,6)
#ci=.95
mean=mean(data)
N=length(data)
df=N-1
sd=sd(data)
alpha=1-ci
c=qt(1-(alpha/2), df)
LB=mean – c * (sd/sqrt(N))
UB=mean + c * (sd/sqrt(N))
output=list(LB, UB)
names(output)=c(“Lower Bound”, “Upper Bound”)
print(output)
}
#So, now if I have data from a normal distribution (x) of mean=0 and sd=1:
x=rnorm(500, 0, 1)
# I can use my new function to compute the 95% confidence interval
CI_var_unknown(data=x, ci=0.95)
#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
#Exercise 5-1:
# A) Write a function called CI_var_known for computing the confidence intervals when the population
# variance is known
# B) Compute the 90% CI for the data x using your new function.
#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
#A)
CI_var_known = function(data, ci) {
#data=c(1,2,3,4,5,6)
#ci=.95
mean=mean(data)
N=length(data)
sd=sd(data)
alpha=1-ci
c=qnorm(1-(alpha/2))
LB=mean – c * (sd/sqrt(N))
UB=mean + c * (sd/sqrt(N))
output=list(LB, UB)
names(output)=c(“Lower Bound”, “Upper Bound”)
print(output)
}
#B)
CI_var_known(data=x, ci=0.90)
#———————————————————————————
# 5. The T-Distribution
#———————————————————————————
#Let’s start to understand the T distribution by reading data
# that we will use as our population
#The scan() function is used to read in external text files as vectors
#Let’s read in Lab6.txt file
pop=scan(‘Lab6.txt’)
#And a quick look at the data
hist(pop); mean(pop); sd(pop)
# There is a formula for a T-score such that:
# T = (SampleMean – PopMean)/(SampleSD/sqrt(SampleN))
#What we will do is take multiple samples from our population,
#compute the T score for each sample, and then compare
#to a random T distribution
#Let’s take 500 samples of size 20
sams = matrix(, ncol=500, nrow=20)
for (ii in 1:500) {
sams[,ii] = sample(pop, size=20, replace=TRUE)
}
#’sams’ now contains 500 random samples from the population,
#now I will use the T-score formula to compute a T score for each sample
#I will store my 500 T scores into a numeric vector called samtscores
samtscores=numeric(500)
for (jj in 1:500) {
samtscores[jj]= ( mean(sams[,jj]) – mean(pop) ) / ( sd(sams[,jj]) / sqrt(20) )
}
#Let’s take a quick look at the distribution of the T-scores
hist(samtscores)
#Lastly, I’d like to compare what we have to a simulated T -distribution
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
# Random T Score: rt(n, df)
#
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
# The function rt() will create n random T scores from a t-distribution
# with n-1 df
simt=rt(500, 19)
# We can graphically compare the two distributions by using an overlaid density plot
plot(density(samtscores))
lines(density(simt), col=”red”)
#??????????????????????????????????????????????????????????????#
#Thought Question 3: How does our simulated t scores compare
# to the actual t-scores from our samples?
#??????????????????????????????????????????????????????????????#


0 comments