forked from kristineccles/Introduction_to_R
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathintro_to_R_exercise1.R
92 lines (71 loc) · 2.69 KB
/
intro_to_R_exercise1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#############################################################
# Exercise 1 - Introduction to R
# By: Kristin Eccles
# Written in R 3.5.0
#############################################################
# Load libraries
# Requires the ggplot2 package, which is not part of the base installation.
# To install the package, uncomment the following line:
# install.packages("ggplot2", dependencies = TRUE)
library(ggplot2)

# Load data
# The file name is resolved relative to the current working directory.
# stringsAsFactors = TRUE is set explicitly: this script was written for
# R 3.5.0, where text columns (such as sex) were read as factors by default.
# Since R 4.0.0 the default is FALSE, which leaves them as character vectors;
# the pairs() call later in this script then stops with a "non-numeric
# argument" error and summary() no longer tabulates the categories.
sturgeon <- read.csv("sturgeon.csv", stringsAsFactors = TRUE)
#############################################################
# Check to see if we have missing data
# Count the missing values in each column. Printing is.na(sturgeon) directly
# would show one TRUE/FALSE per cell, which is unreadable for a whole dataset.
colSums(is.na(sturgeon))
# Summarize the contents of the sturgeon dataframe
summary(sturgeon)
# For numeric and factor columns that contain missing values, this summary
# also reports the number of NA's
# Clean the data (if necessary)
# Create a new dataset that keeps only the rows without any missing value
sturgeon_clean <- na.omit(sturgeon)
#############################################################
# Visualize the data
# Make a histogram of the variable fklngth in the cleaned sturgeon data
# This uses the base R plotting
hist(sturgeon_clean$fklngth)

# Make the columns of the data frame directly callable by name.
# with() evaluates an expression using the columns of the data frame, without
# the lasting side effects of attach(): nothing is added to the search path
# and re-running the script does not stack up repeated attachments.
# ls() lists the objects in the workspace; the columns of the data frame are
# not among them unless they were created separately
ls()
# Redo the same histogram as before
with(sturgeon_clean, hist(fklngth))

# Use the sturgeon_clean dataframe to build a density histogram called plot1,
# with fklngth on the x axis.
# after_stat(density) replaces the ..density.. notation, which is deprecated
# in current ggplot2 releases (after_stat() requires ggplot2 >= 3.3.0).
plot1 <- ggplot(sturgeon_clean) +
  geom_histogram(aes(x = fklngth, y = after_stat(density)),
    binwidth = 10, fill = "grey", color = "black"
  )
plot1
# Try changing the binwidth - what happens?

# Distribution of fklngth by subsets of sex and year
# Same kind of density histogram (with narrower bins), split into one panel
# per combination of sex (rows) and year (columns)
plot2 <- ggplot(sturgeon_clean) +
  geom_histogram(aes(x = fklngth, y = after_stat(density)),
    binwidth = 5, fill = "grey", color = "black"
  ) +
  facet_grid(sex ~ year)
plot2

# Normal QQ plot of fklngth, with its reference line
with(sturgeon_clean, {
  qqnorm(fklngth)
  qqline(fklngth)
})
# Are there outliers?

# Shapiro-Wilk test of normality on fklngth
with(sturgeon_clean, shapiro.test(fklngth))

# Boxplot of fklngth by sex, with whiskers and notches
# base graphics
boxplot(fklngth ~ sex, data = sturgeon_clean, notch = TRUE)
# ggplot2 version
plot3 <- ggplot(sturgeon_clean, aes(x = sex, y = fklngth))
plot3 + geom_boxplot()

# Scatterplot of fklngth as a function of age
# base graphics
plot(fklngth ~ age, data = sturgeon_clean)
# ggplot2 version
plot4 <- ggplot(sturgeon_clean, aes(x = age, y = fklngth))
plot4 + geom_point()
# Matrix of scatterplots of all pairs of variables in dataframe sturgeon,
# with a lowess trace drawn in each panel (panel.smooth).
# pairs() stops on columns that are not numeric, factor or logical.
# data.matrix() first converts text columns such as sex to integer codes, so
# the call also works when read.csv() leaves them as character vectors
# (the default since R 4.0.0).
pairs(data.matrix(sturgeon), panel = panel.smooth)
# Keep only the rows of the cleaned data for females captured in 1978
sturgeon.female.1978 <- subset(
  sturgeon_clean,
  year == "1978" & sex == "FEMALE"
)
# Histogram of fklngth for the females captured in 1979 or 1980
# Step 1: extract the matching rows from the cleaned data
mysubset <- subset(
  sturgeon_clean,
  sex == "FEMALE" & year %in% c("1979", "1980")
)
# Step 2: plot the fork length distribution of that subset
hist(mysubset$fklngth)