Big Data Analysis with Revolution R Enterprise
Foreword
- Output options: the ‘tango’ syntax and the ‘readable’ theme.
- Snippets only.
Documentation¶
- Microsoft R Server: previously called Revolution R Enterprise for Hadoop, Linux and Teradata and included new Microsoft enterprise support and purchasing options. Microsoft R Server was further made available to students through the Microsoft DreamSpark programme.
- Microsoft R Server Developer Edition: a free version for developers with a feature set akin to the commercial edition.
- Microsoft Data Science Virtual Machine: an analytics tool developed by the Revolution Analytics division, premiered in January 2015.
- Microsoft R Open: a rebranded version of Revolution R Open.
Introduction¶
Importing data with rxImport
function
# Source csv (read from the RevoScaleR sample data directory) and the name
# of the xdf file the import writes into the working directory.
myAirlineCsv <- file.path(rxGetOption("sampleDataDir"), "2007_subset.csv")
myAirlineXdf <- "2007_subset.xdf"

# Convert the csv to xdf, replacing any earlier copy of the output file.
# The untimed form of the call would be:
#   rxImport(inData = myAirlineCsv, outFile = myAirlineXdf, overwrite = TRUE)
# Wrapping it in system.time() additionally reports how long the import took.
system.time(
  rxImport(
    overwrite = TRUE,
    outFile = myAirlineXdf,
    inData = myAirlineCsv
  )
)

# Show the contents of the working directory.
list.files()
Functions for summarizing data
# File-level information, variable metadata and 10 rows of the data.
rxGetInfo(
  data = myAirlineXdf,
  numRows = 10,
  getVarInfo = TRUE
)

# Summary statistics for elapsed time, air time, departure delay and distance.
rxSummary(
  data = myAirlineXdf,
  formula = ~ ActualElapsedTime + AirTime + DepDelay + Distance
)

# Distribution of departure delays over the full range.
rxHistogram(
  data = myAirlineXdf,
  formula = ~DepDelay
)

# Same histogram, restricted to the [-100, 400] window with finer bins and
# 10 x-axis ticks so the interesting region is readable.
rxHistogram(
  data = myAirlineXdf,
  formula = ~DepDelay,
  numBreaks = 500,
  xNumTicks = 10,
  xAxisMinMax = c(-100, 400)
)
Creating new variables using rxDataStep
# Add a derived column: airSpeed = distance traveled / time in the air.
# The transform must reference the column as `AirTime` (the spelling used in
# varsToKeep and in the earlier rxSummary call); R names are case-sensitive,
# so `Airtime` would not be found.
# NOTE(review): the RevoScaleR docs appear to say varsToKeep cannot be
# combined with outFile == inData -- confirm against the installed version.
rxDataStep(inData = myAirlineXdf,
           outFile = myAirlineXdf,
           varsToKeep = c('Distance', 'AirTime'),
           transforms = list(airSpeed = Distance / AirTime),
           append = 'cols',
           overwrite = TRUE)

# Variable information for the new airSpeed column only
rxGetInfo(data = myAirlineXdf,
          getVarInfo = TRUE,
          varsToKeep = 'airSpeed')

# Summary statistics for airSpeed
rxSummary(~airSpeed,
          data = myAirlineXdf)

# Histogram of airSpeed with default axis limits
rxHistogram(~airSpeed,
            data = myAirlineXdf)

# Same histogram, using xAxisMinMax to limit the x-axis to [0, 12]
rxHistogram(~airSpeed,
            data = myAirlineXdf,
            xNumTicks = 10,
            numBreaks = 1500,
            xAxisMinMax = c(0, 12))
Transforming variables using rxDataStep
# Rescale airSpeed in place by a factor of 60 (per-minute to per-hour;
# miles per hour assuming Distance is in miles and AirTime in minutes).
rxDataStep(
  inData = myAirlineXdf,
  outFile = myAirlineXdf,
  overwrite = TRUE,
  transforms = list(airSpeed = airSpeed * 60),
  varsToKeep = c("airSpeed")
)

# Distribution of airSpeed after the rescaling.
rxHistogram(
  formula = ~airSpeed,
  data = myAirlineXdf
)
Correlations
# Correlation matrix of departure delay, arrival delay and air speed,
# computed only on rows whose airSpeed lies strictly between 50 and 800.
rxCor(
  data = myAirlineXdf,
  formula = ~ DepDelay + ArrDelay + airSpeed,
  rowSelection = (airSpeed > 50) & (airSpeed < 800)
)
Linear regression
# Linear model of airSpeed on departure delay, fitted to the rows whose
# airSpeed lies strictly between 50 and 800.
myLMobj <- rxLinMod(
  data = myAirlineXdf,
  formula = airSpeed ~ DepDelay,
  rowSelection = (airSpeed > 50) & (airSpeed < 800)
)

# Print the fitted model's summary.
summary(myLMobj)
Data Exploration¶
RevoScaleR
options
# Names of all RevoScaleR options that can be queried or set.
names(rxOptions())

# Current values of two individual options: the sample data directory and
# the progress-reporting level.
rxGetOption("sampleDataDir")
rxGetOption("reportProgress")

# Set reportProgress to 0.
rxOptions(reportProgress = 0)
Import and explore Dow Jones data
# Full path of the Dow Jones daily xdf file in the sample data directory.
djiXdf <- file.path(rxGetOption("sampleDataDir"), "DJIAdaily.xdf")

# File information together with per-variable metadata.
rxGetInfo(data = djiXdf, getVarInfo = TRUE)
Extracting meta data about a variable using rxGetVarInfo
# Metadata for every variable in the dataset, and the variable names.
djiVarInfo <- rxGetVarInfo(djiXdf)
names(djiVarInfo)

# Pull out the entry for the Close variable and display it.
closeVarInfo <- djiVarInfo[["Close"]]
closeVarInfo

# Inspect what kind of object that entry is and how it is laid out.
class(closeVarInfo)
str(closeVarInfo)

# The 'high' element of the entry is kept as the maximum of Close.
closeMax <- closeVarInfo[["high"]]
Summarizing variables with rxSummary
# Unweighted summary of day of week, closing value and volume.
rxSummary(
  formula = ~ DayOfWeek + Close + Volume,
  data = djiXdf
)

# Summary of day of week and closing value, using Volume as frequency weights.
rxSummary(
  formula = ~ DayOfWeek + Close,
  fweights = "Volume",
  data = djiXdf
)

# Plain frequency count of observations per day of week.
rxCrossTabs(
  formula = ~ DayOfWeek,
  data = djiXdf
)
Exploring a distribution with rxHistogram
# Histogram of a numeric variable.
rxHistogram(formula = ~ Close, data = djiXdf)

# Histogram (bar counts) of a categorical variable.
rxHistogram(formula = ~ DayOfWeek, data = djiXdf)

# One panel of the Close histogram per day of the week.
rxHistogram(formula = ~ Close | DayOfWeek, data = djiXdf)

# Close histogram with Volume used as frequency weights.
rxHistogram(
  formula = ~ Close,
  fweights = "Volume",
  data = djiXdf
)
Plotting bivariate relationships with rxLinePlot
# Closing value against days since 1928.
rxLinePlot(formula = Close ~ DaysSince1928, data = djiXdf)

# Same relationship, drawn in a separate panel for each day of the week.
rxLinePlot(formula = Close ~ DaysSince1928 | DayOfWeek, data = djiXdf)

# Same relationship, with each day of the week drawn as its own group.
rxLinePlot(
  formula = Close ~ DaysSince1928,
  data = djiXdf,
  groups = DayOfWeek
)

# Closing value on a log scale (log() applied to the y variable).
rxLinePlot(formula = log(Close) ~ DaysSince1928, data = djiXdf)
Summarizing variables with rxCrossTabs
# Summed volume for each day of the week.
rxCrossTabs(
  data = djiXdf,
  formula = Volume ~ DayOfWeek
)

# Summed volume for each day of the week within each month
# (F() treats Month as a factor).
rxCrossTabs(
  data = djiXdf,
  formula = Volume ~ F(Month):DayOfWeek
)

# Average, rather than summed, volume for the same month-by-day cells.
rxCrossTabs(
  data = djiXdf,
  formula = Volume ~ F(Month):DayOfWeek,
  means = TRUE
)

# Average closing price per month-by-day cell, with Volume as frequency
# weights.
rxCrossTabs(
  data = djiXdf,
  formula = Close ~ F(Month):DayOfWeek,
  fweights = "Volume",
  means = TRUE
)
Summarizing variables with rxCube
# Summed volume for each day of the week: rxCrossTabs and the equivalent
# rxCube call (means = FALSE makes rxCube report sums).
rxCrossTabs(Volume ~ DayOfWeek,
            data = djiXdf)
rxCube(Volume ~ DayOfWeek,
       data = djiXdf,
       means = FALSE)

# Summed volume for each day of the week for each month, both ways
rxCrossTabs(Volume ~ F(Month):DayOfWeek,
            data = djiXdf)
rxCube(Volume ~ F(Month):DayOfWeek,
       data = djiXdf,
       means = FALSE)

# Average volume for each day of the week for each month
# (no means argument, so rxCube's default is used)
rxCube(Volume ~ F(Month):DayOfWeek,
       data = djiXdf)

# Average closing price for each day of the week for each month, using
# volume as frequency weights. The formula needs the F(Month) term to produce
# the per-month breakdown described here; `Close ~ DayOfWeek` alone would
# only give one row per day of the week.
rxCube(Close ~ F(Month):DayOfWeek,
       data = djiXdf,
       fweights = 'Volume')
Data Manipulation¶
Using rxDataStep
to transform data
# Basic information about the mortData dataset.
rxGetInfo(data = mortData)

# Name of the working copy that the transforms below write to.
myMortData <- "myMD.xdf"

# Write the working copy with one extra logical column that flags rows
# whose credit-card debt exceeds 8000; any existing copy is replaced.
rxDataStep(
  inData = mortData,
  outFile = myMortData,
  overwrite = TRUE,
  transforms = list(highDebtRow = ccDebt > 8000)
)

# Variable metadata of the working copy.
rxGetVarInfo(myMortData)

# Summary of the flag column (its mean is the proportion of flagged rows).
rxSummary(formula = ~ highDebtRow, data = myMortData)

# Several transforms in one pass, appended as new columns to the same file:
#   newHouse - house younger than 10
#   ccsXhd   - credit score multiplied by the high-debt flag
rxDataStep(
  inData = myMortData,
  outFile = myMortData,
  overwrite = TRUE,
  append = "cols",
  transforms = list(
    newHouse = houseAge < 10,
    ccsXhd = creditScore * highDebtRow
  )
)
More complex transforms using transformFuncs
# Compute (and print) the summary statistics for creditScore
(csSummary <- rxSummary(~ creditScore, data = mortData))

# Extract the mean and standard deviation from the summary data frame
meanCS <- csSummary$sDataFrame$Mean[1]
sdCS <- csSummary$sDataFrame$StdDev[1]

# Transform function for rxDataStep: receives the current chunk of data as a
# list of variables and returns that list with an added scaledCreditScore
# element. `myCenter` and `myScale` are not arguments; they are supplied to
# the transform through rxDataStep's transformObjects.
scaleCS <- function(mylist) {
  mylist[['scaledCreditScore']] <-
    (mylist[['creditScore']] - myCenter) / myScale
  mylist
}

# Apply the transform function. overwrite = TRUE is required because
# 'myMD.xdf' may already exist (it is created earlier in these notes), and
# rxDataStep refuses to replace an existing outFile otherwise.
myMortData <- 'myMD.xdf'
rxDataStep(inData = mortData, outFile = myMortData,
           transformFunc = scaleCS,
           transformObjects = list(myCenter = meanCS, myScale = sdCS),
           overwrite = TRUE)

# Check the new variable
rxGetVarInfo(myMortData)
rxSummary(~ scaledCreditScore,
          data = myMortData)
Data Analysis¶
Preparing data for analysis: import
# Source csv in the sample data directory and the xdf file to create.
myAirlineCsv <- file.path(rxGetOption("sampleDataDir"), "AirlineDemoSmall.csv")
myAirlineXdf <- "ADS.xdf"

# Import to xdf, replacing any existing output. DayOfWeek is read as a
# factor with its levels given explicitly in Monday-to-Sunday order.
rxImport(
  inData = myAirlineCsv,
  outFile = myAirlineXdf,
  colInfo = list(
    DayOfWeek = list(
      type = "factor",
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    )
  ),
  overwrite = TRUE
)
Preparing data for analysis: exploration
# Arrival-delay summary statistics broken down by day of the week.
rxSummary(
  data = myAirlineXdf,
  formula = ArrDelay ~ DayOfWeek
)

# Histogram of arrival delay.
rxHistogram(
  data = myAirlineXdf,
  formula = ~ArrDelay
)
Construct a linear model
# Model 1: predict arrival delay by day of the week
myLM1 <- rxLinMod(ArrDelay ~ DayOfWeek,
                  data = myAirlineXdf)

# Summarize the model (was misspelled `smmary`, which is not a function)
summary(myLM1)

# Model 2: use the transforms argument to create, on the fly, a factor
# (catDepTime) that bins the scheduled departure time into 2-hour intervals
# from 5 to 23, and predict arrival delay by the interaction between day of
# the week and that new factor. The formula must reference
# DayOfWeek:catDepTime, otherwise the transform is computed but never used
# and the model is identical to myLM1.
myLM2 <- rxLinMod(ArrDelay ~ DayOfWeek:catDepTime,
                  data = myAirlineXdf,
                  transforms = list(
                    catDepTime = cut(CRSDepTime,
                                     breaks = seq(from = 5, to = 23, by = 2))),
                  cube = TRUE)

# Summarize the model
summary(myLM2)
Generating predictions and residuals
# Recap of the model the predictions are based on.
summary(myLM2)

# xdf file that will hold the predictions.
myNewADS <- "myNEWADS.xdf"

# Write predictions, together with the model's variables, to the new file.
rxPredict(
  modelObject = myLM2,
  data = myAirlineXdf,
  writeModelVars = TRUE,
  outData = myNewADS
)

# Inspect the file that was just written.
rxGetInfo(data = myNewADS, getVarInfo = TRUE)

# Re-run the prediction with residuals requested as well; overwrite = TRUE
# replaces the output file produced by the previous call.
rxPredict(
  modelObject = myLM2,
  data = myAirlineXdf,
  overwrite = TRUE,
  computeResiduals = TRUE,
  writeModelVars = TRUE,
  outData = myNewADS
)

# Inspect the file again after the second run.
rxGetInfo(data = myNewADS, getVarInfo = TRUE)
Logistic regression
# List the objects in the workspace and look at the metadata of mortData
ls()
rxGetInfo(data = mortData, getVarInfo = TRUE)

# Construct the logit model: default as a function of house age, year
# (treated as a factor via F()), credit-card debt, credit score and years
# of employment
logitModel <- rxLogit(
  formula = default ~ houseAge + F(year) + ccDebt + creditScore + yearsEmploy,
  data = mortData)

# Summarize the result contained in logitModel
# (the closing parenthesis was missing, which is a syntax error)
summary(logitModel)
Individual mortgage information
# Recap of the fitted logit model.
summary(logitModel)

# Peek at the first rows of the data to be scored.
head(newData)

# Score newData with the model; type = 'response' is passed through to
# rxPredict and the result is written back onto newData via outData.
dataWithPredictions <- rxPredict(
  modelObject = logitModel,
  type = "response",
  data = newData,
  outData = newData
)

# Display the scored data.
dataWithPredictions
Computing k-means with rxKmeans
# File information and variable metadata for mortData.
rxGetInfo(data = mortData, getVarInfo = TRUE)

# xdf file that will receive the cluster assignments.
myNewMortData <- "myMDwithKMeans.xdf"

# k-means with 4 clusters on debt, credit score and house age, restricted
# to rows from the year 2000. The model variables are written to the output
# file alongside the cluster assignment.
KMout <- rxKmeans(
  formula = ~ ccDebt + creditScore + houseAge,
  data = mortData,
  numClusters = 4,
  rowSelection = year == 2000,
  writeModelVars = TRUE,
  outFile = myNewMortData
)
print(KMout)

# Variables now present in the output file.
rxGetInfo(data = myNewMortData, getVarInfo = TRUE)

# Summary of the cluster assignment, treated as a factor.
rxSummary(formula = ~ F(.rxCluster), data = myNewMortData)

# Pull roughly one tenth of the rows into memory: each row draws a random
# integer from 1 to 10 and only rows that drew 1 are kept. 'year' is dropped.
mydf <- rxXdfToDataFrame(
  myNewMortData,
  transforms = list(
    randSamp = sample(10, size = .rxNumRows, replace = TRUE)
  ),
  rowSelection = randSamp == 1,
  varsToDrop = "year"
)

# Scatterplot of all columns except the first, coloured by cluster.
plot(mydf[-1], col = mydf$.rxCluster)
Create some decision trees
# Tree for default on credit score, debt, years employed and house age,
# fitted to rows from the year 2000 with depth capped at 5.
regTreeOut <- rxDTree(
  formula = default ~ creditScore + ccDebt + yearsEmploy + houseAge,
  data = mortData,
  rowSelection = year == 2000,
  maxdepth = 5
)

# Text description of the fitted tree.
print(regTreeOut)

# Draw the tree and label its nodes; rxAddInheritance() wraps the fitted
# tree before it is handed to plot() and text().
plot(rxAddInheritance(regTreeOut))
text(rxAddInheritance(regTreeOut))

# Alternative interactive view (left disabled):
# library(RevoTreeView)
# createTreeView(regTreeOut)

# Score mortData with the tree, writing the model variables and the
# prediction column 'default_RegPred' to a new xdf file.
myNewData <- "myNewMortData.xdf"
rxPredict(
  regTreeOut,
  data = mortData,
  outData = myNewData,
  predVarNames = "default_RegPred",
  writeModelVars = TRUE
)

# ROC curve of the predictions against the observed default flag.
rxRocCurve(
  data = myNewData,
  actualVarName = "default",
  predVarNames = "default_RegPred"
)