#
#
############### Some Descriptive Statistics & Associated Graphing ###############
#
# This script assumes you have worked through all the previous notes from
# the web page and you have downloaded, installed, and updated all available
# R packages.
# If an additional library is needed, it will be listed in the script and loaded.
library(Rcmdr)
# Start by reading the data file into R from the web address.
example3 <- read.table("http://researchsupport.unt.edu/class/Jon/R_SC/Module3/ExampleData3.txt",
header=TRUE, sep="", na.strings="NA", dec=".", strip.white=TRUE)
summary(example3)
# Get rid of the goofy characters in the name of the age variable; by
# using the 'names' function to change whatever the first name [1] of
# example3 is (in this case: ï..age) to "age".
names(example3)
names(example3) [1] <- "age"
names(example3)
# Attach the data so we can refer to variable names directly (without $).
attach(example3)
# Get some quick summaries of each variable in the data.
# WHEN USED WITH A SINGLE VARIABLE OR A DATA FRAME: The 'summary' function
# returns some basic descriptive statistics for each variable: minimum,
# 1st quantile, median, mean, 3rd quantile,maximum, and number of NA (missing
# data) if present. If the variable is nominal; then 'summary' returns the
# number of cases for each value of the variable (and number of NA if present).
summary(gender)
summary(age)
summary(example3)
# Mean of age for each category of gender.
tapply(example3$age, list(gender=example3$gender), mean, na.rm=TRUE)
# Summary of age BY gender.
numSummary(example3[,"age"], groups=example3$gender, statistics=c("mean",
"sd", "quantiles"), quantiles=c(0,.25,.5,.75,1))
# Count missing values in each vector/variable of example3.
sapply(example3, function(x)(sum(is.na(x))))
# Mean, variance, standard deviation, sum and product of age.
mean(age)
var(age)
sd(age)
sum(age)
prod(age)
# Number of rows/cases of example3 (data frame) and age (vector).
nrow(example3)
length(age)
# Median of age.
median(age)
# Trimmed mean: 20% trim of age, remove any missing data (NA).
mean(age, tr=.2, na.rm=TRUE)
# Weighted mean: mean of age with weights from marital.
weighted.mean(age, marital)
# Weighted standard deviation function -- examples of how to write basic functions:
weighted.sd <- function(x, w){
sum.w <- sum(w)
sum.w2 <- sum(w^2)
mean.w <- sum(x * w) / sum(w)
x.sd.w<-sqrt((sum.w / (sum.w^2 - sum.w2)) * sum(w * (x - mean.w)^2))
return(x.sd.w)
}
weighted.sd(age, marital)
# Skewness & Kurtosis functions.
skew <- function(x){
((sum(((x - mean(x))/(sd(x)))^3))/(length(x) - 1))
}
kurt <- function(x,k.type){
ifelse(k.type == 2,
((sum(((x - mean(x))/(sd(x)))^4))/(length(x) - 1)),
(((sum(((x - mean(x))/(sd(x)))^4))/(length(x) - 1)) - 3)
)
}
skew(age)
kurt(age,1)
kurt(age,2)
##############################################
##### Some basic/routine graphics. #####
# If you ever want to change the color of something; you'll likely want to take a
# look at the color names available...there are something like 657 colors in R.
colors()
# Histogram of age (how it's suppose to look). Keep in mind, a later tutorial covers
# how to manipulate the x- and y- axis scales, labels/titles, colors (col), and other graphical
# parameters.
hist(age)
hist(age, col = "lightblue1")
# Histogram/bar graph (how it's NOT suppose to look).
barplot(age)
# Bar chart for gender (remember 3202 females & 3198 males; so the plot is
# rather boring).
plot(gender)
# Convert gender to a numeric vector version of gender. Changing
# the values from m:f to 1:2 for males & females if you prefer.
genderRC <- as.numeric(gender)
plot(genderRC)
# Boxplot of age.
boxplot(age)
# Basic scatter plot.
plot(age, income)
# Another scatter plot.
sunflowerplot(age, income)
# Sub-group scatter plot; income predicted by age with separate plots for
# males and females. *Note the convention (in graphics and model specification
# when we get to it) of using "y ~ x..." to say "y predicted by x...".
coplot(income ~ age | gender)
# Interaction plot (typically used with ANOVA); looking at the interaction of
# gender and marital (status: 1 = married, 0 = not married) on (mean) age.
interaction.plot(gender, marital, age)
# Create a data frame of age & income & employ (years employed); demographic variables.
demog <- data.frame(age, income, employ)
# Simple scatter plot matrix.
pairs(demog)
###############################################################################
# One of the really strong benefits to using R is the ability to
# graph just about anything and also the ability to modify and
# specify everything in a graph.
# As we will see; the graphics from this script represent
# just the very tip of the iceberg when it comes to complexity
# in graphing with R.
# I may add to this at a later date (as is always the case...); but for
# now I think we can move forward.
# End: minor updates, July 18, 2012.