# Nominal variables: participant identifiers are labels, not quantities.

# Create a numeric vector with the identifiers of the participants of your
# survey
participants_1 <- c(2, 3, 5, 7, 11, 13, 17)

# Check what type of values R thinks the vector consists of
class(participants_1)
## [1] "numeric"

# Transform the numeric vector to a factor vector
participants_2 <- factor(participants_1)

# Check what type of values R thinks the vector consists of now
class(participants_2)
## [1] "factor"
Ordinal variables in R
# Create a vector of temperature observations
temperature_vector <- c("High", "Low", "High", "Low", "Medium")
temperature_vector
## [1] "High" "Low" "High" "Low" "Medium"

# Specify that they are ordinal variables with the given levels.
# `ordered` is spelled out in full rather than relying on partial argument
# matching (`order = TRUE`), and `levels` fixes the ranking Low < Medium < High.
factor_temperature_vector <- factor(
  temperature_vector,
  levels = c("Low", "Medium", "High"),
  ordered = TRUE
)
factor_temperature_vector
## [1] High Low High Low Medium
## Levels: Low < Medium < High
Interval and Ratio variables in R
# Assign to the variable 'longitudes' a vector with the longitudes.
# This is an interval variable.
longitudes <- c(10, 20, 30, 40)

# Assign the times it takes for an athlete to run 100 meters to the variable
# 'chronos'.
# This is a ratio variable.
chronos <- c(10.60, 10.12, 9.58, 11.1)
## subject condition verbal_memory_baseline
## Min. : 1.00 Length:40 Min. :75.00
## 1st Qu.:10.75 Class :character 1st Qu.:85.00
## Median :20.50 Mode :character Median :91.00
## Mean :20.50 Mean :89.75
## 3rd Qu.:30.25 3rd Qu.:95.00
## Max. :40.00 Max. :98.00
## visual_memory_baseline visual.motor_speed_baseline reaction_time_baseline
## Min. :59.00 Min. :26.29 Min. :0.4200
## 1st Qu.:68.75 1st Qu.:31.59 1st Qu.:0.5675
## Median :75.00 Median :33.50 Median :0.6500
## Mean :74.88 Mean :34.03 Mean :0.6670
## 3rd Qu.:81.25 3rd Qu.:36.44 3rd Qu.:0.7325
## Max. :91.00 Max. :41.87 Max. :1.2000
## impulse_control_baseline total_symptom_baseline verbal_memory_retest
## Min. : 2.000 Min. :0.00 Min. :59
## 1st Qu.: 7.000 1st Qu.:0.00 1st Qu.:74
## Median : 8.500 Median :0.00 Median :85
## Mean : 8.275 Mean :0.05 Mean :82
## 3rd Qu.:10.000 3rd Qu.:0.00 3rd Qu.:91
## Max. :12.000 Max. :1.00 Max. :97
## visual_memory_retest visual.motor_speed_retest reaction_time_retest
## Min. :54.00 Min. :20.15 Min. :0.1900
## 1st Qu.:66.75 1st Qu.:30.33 1st Qu.:0.5575
## Median :72.00 Median :35.15 Median :0.6500
## Mean :71.90 Mean :35.83 Mean :0.6730
## 3rd Qu.:79.00 3rd Qu.:39.41 3rd Qu.:0.7325
## Max. :86.00 Max. :60.77 Max. :1.3000
## impulse_control_retest total_symptom_retest
## Min. : 1.00 Min. : 0.00
## 1st Qu.: 5.00 1st Qu.: 0.00
## Median : 7.00 Median : 7.00
## Mean : 6.75 Mean :13.88
## 3rd Qu.: 9.00 3rd Qu.:27.00
## Max. :12.00 Max. :43.00
# Select the variable 'verbal_memory_baseline' from the 'impact' data.frame
# and assign it to the variable 'verbal_baseline'.
# NOTE(review): 'impact' is not created anywhere in the visible code; it must
# already be loaded in the session before this runs.
verbal_baseline <- impact$verbal_memory_baseline
verbal_baseline

# Plot a histogram of the verbal_baseline variable that you have just created
hist(
  verbal_baseline,
  main = "Distribution of verbal memory baseline scores",
  xlab = "score",
  ylab = "frequency"
)
Let us go wine tasting (red wine)
# Read in the data set and assign to the object.
# NOTE(review): readWorksheetFromFile() is not base R (presumably XLConnect);
# the package must be attached before this runs.
red_wine_data <- readWorksheetFromFile(
  "A Hands-on Introduction to Statistics with R.xls",
  sheet = "red_wine_data",
  header = TRUE,
  startCol = 1,
  startRow = 1
)

# This will print the data set in the console
head(red_wine_data)
## subject condition Ratings
## 1 1 Australia 77
## 2 2 Australia 82
## 3 3 Australia 75
## 4 4 Australia 92
## 5 5 Australia 83
## 6 6 Australia 75

# Print basic statistical properties of the red_wine_data data.frame.
# The exercise asks for the describe() function; base R's summary() is used
# here instead, so the describe() call stays commented out.
# describe(red_wine_data)
summary(red_wine_data)
## subject condition Ratings
## Min. : 1.0 Length:400 Min. :39.00
## 1st Qu.:100.8 Class :character 1st Qu.:67.00
## Median :200.5 Mode :character Median :74.00
## Mean :200.5 Mean :73.94
## 3rd Qu.:300.2 3rd Qu.:81.00
## Max. :400.0 Max. :98.00

# Split the data.frame in subsets for each country and assign these subsets to
# the variables below. subset() evaluates the condition inside the data.frame,
# so the column can be named directly.
red_usa <- subset(red_wine_data, condition == "USA")
red_france <- subset(red_wine_data, condition == "France")
red_australia <- subset(red_wine_data, condition == "Australia")
red_argentina <- subset(red_wine_data, condition == "Argentina")

# Select only the Ratings variable for each of these subsets and assign them to
# the variables below
red_ratings_usa <- red_usa$Ratings
red_ratings_france <- red_france$Ratings
red_ratings_australia <- red_australia$Ratings
red_ratings_argentina <- red_argentina$Ratings

## Create a 2 by 2 matrix of histograms
# Organize the histograms so that they are structured in a 2 by 2 matrix.
# The layout set here stays in effect for later plots until par() is called
# again.
par(mfrow = c(2, 2))

# Plot four histograms, one for each country
hist(red_ratings_usa)
hist(red_ratings_france)
hist(red_ratings_australia)
hist(red_ratings_argentina)
Let us go wine tasting (white wine)
# Read in the data set and assign to the object.
# NOTE(review): readWorksheetFromFile() is not base R (presumably XLConnect);
# the package must be attached before this runs.
white_wine_data <- readWorksheetFromFile(
  "A Hands-on Introduction to Statistics with R.xls",
  sheet = "white_wine_data",
  header = TRUE,
  startCol = 1,
  startRow = 1
)

# This will print the data set in the console
head(white_wine_data)
## condition Ratings
## 1 Australia 85.6
## 2 Australia 85.6
## 3 Australia 85.6
## 4 Australia 85.6
## 5 Australia 85.6
## 6 Australia 85.6

# Assign the scores for each country to a variable. subset() evaluates the
# condition inside the data.frame, so the column can be named directly.
white_ratings_france <- subset(white_wine_data, condition == "France")$Ratings
white_ratings_argentina <-
  subset(white_wine_data, condition == "Argentina")$Ratings
white_ratings_australia <-
  subset(white_wine_data, condition == "Australia")$Ratings
white_ratings_usa <- subset(white_wine_data, condition == "USA")$Ratings

# Plot a histogram for each of the countries.
# Organize the histograms so that they are structured in a 2 by 2 matrix.
par(mfrow = c(2, 2))
hist(white_ratings_usa, main = "USA white ratings", xlab = "score")
hist(white_ratings_australia, main = "Australia white ratings", xlab = "score")
hist(white_ratings_argentina, main = "Argentina white ratings", xlab = "score")
hist(white_ratings_france, main = "France white ratings", xlab = "score")
# Read in the data set and assign to the object.
# NOTE(review): readWorksheetFromFile() is not base R (presumably XLConnect);
# the package must be attached before this runs.
ratings_australia <- readWorksheetFromFile(
  "A Hands-on Introduction to Statistics with R.xls",
  sheet = "ratings_australia",
  header = TRUE,
  startCol = 1,
  startRow = 1
)
# Keep only the 'ratings_australia' column, as a plain vector
ratings_australia <- as.vector(ratings_australia$ratings_australia)

# Print the ratings for the Australian red wine
ratings_australia

# Convert these ratings to Z-scores. Use the `scale()` function
z_scores_australia <- scale(ratings_australia)

# Plot both the original data and the scaled data in histograms next to each
# other
par(mfrow = c(1, 2))

# Plot the histogram for the original scores
hist(ratings_australia)

# Plot the histogram for the Z-scores
hist(z_scores_australia)
# Create a vector that contains the Fibonacci elements
fibonacci <- c(0, 1, 1, 2, 3, 5, 8, 13)

# Calculate the mean manually. Use the sum() and the length() functions.
# The result is stored in a variable called `mean`, which shares its name with
# the base function. Calls such as mean(x) still work because R skips
# non-function objects when it resolves a function call.
mean <- sum(fibonacci) / length(fibonacci)
mean
## [1] 4.125

# Calculate the mean the easy way
mean_check <- mean(fibonacci)
mean_check
## [1] 4.125
Setting up histograms
# Read in the data set and assign to the object.
# NOTE(review): readWorksheetFromFile() is not base R (presumably XLConnect);
# the package must be attached before this runs.
wine_data <- readWorksheetFromFile(
  "A Hands-on Introduction to Statistics with R.xls",
  sheet = "wine_data",
  header = TRUE,
  startCol = 1,
  startRow = 1
)
head(wine_data)
## condition Ratings
## 1 Red 77
## 2 Red 82
## 3 Red 75
## 4 Red 92
## 5 Red 83
## 6 Red 75

# Create the two subsets. subset() evaluates the condition inside the
# data.frame, so the column can be named directly.
red_wine <- subset(wine_data, condition == "Red")
white_wine <- subset(wine_data, condition == "White")

# Plot the histograms of the ratings of both subsets
par(mfrow = c(1, 2))
hist(red_wine$Ratings, main = "Shiraz", xlab = "Ratings")
hist(white_wine$Ratings, main = "Pinot Grigio", xlab = "Ratings")
Robustness to outliers
# Create the outlier (a single red-wine rating of 0) and add it to the dataset.
# Relies on `red_wine` having the columns 'condition' and 'Ratings' so that
# rbind() can match them by name.
outlier <- data.frame(condition = "Red", Ratings = 0)
red_wine_extreme <- rbind(red_wine, outlier)

# Calculate the difference in means and display it afterwards
diff_means <- mean(red_wine$Ratings) - mean(red_wine_extreme$Ratings)
diff_means
## [1] 0.8093069

# Calculate the difference in medians and display it afterwards
diff_medians <- median(red_wine$Ratings) - median(red_wine_extreme$Ratings)
diff_medians
Michael Jordan’s first NBA season - Global overview
# Read in the data set and assign to the object.
# NOTE(review): readWorksheetFromFile() is not base R (presumably XLConnect);
# the package must be attached before this runs.
data_jordan <- readWorksheetFromFile(
  "A Hands-on Introduction to Statistics with R.xls",
  sheet = "data_jordan",
  header = TRUE,
  startCol = 1,
  startRow = 1
)
head(data_jordan)

# Make a scatterplot of the data on which a horizontal line with height equal
# to the mean is drawn.
mean_jordan <- mean(data_jordan$points)
plot(
  data_jordan$game,
  data_jordan$points,
  main = "1st NBA season of Michael Jordan"
)
abline(h = mean_jordan)
Michael Jordan’s first NBA season - Calculate the variance manually
# Calculate the differences with respect to the mean.
# The result is stored in a variable called `diff`, which shares its name with
# the base function diff(); the name is kept so the rest of the exercise still
# matches.
diff <- data_jordan$points - mean(data_jordan$points)

# Calculate the squared differences
squared_diff <- diff^2

# Combine all pieces of the puzzle in order to acquire the variance.
# Dividing by (n - 1) gives the sample variance, which is what var() computes.
variance <- sum(squared_diff) / (length(data_jordan$points) - 1)
variance
## [1] 66.73427

# Compare your result to the correct solution. You can find the correct
# solution by calculating it with the `var()` function.
var(data_jordan$points)