Big Data Analysis with Revolution R Enterprise

Foreword

Output options: the ‘tango’ syntax and the ‘readable’ theme.
Snippets only.

Documentation¶

Microsoft R Server: previously called Revolution R Enterprise for Hadoop, Linux and Teradata; it includes new Microsoft enterprise support and purchasing options. Microsoft R Server was also made available to students through the Microsoft DreamSpark programme.
Microsoft R Server Developer Edition: a gratis version for developers with a feature set akin to the commercial edition.
Microsoft Data Science Virtual Machine: an analytics tool developed by the Revolution Analytics division, which premiered in January 2015.
Microsoft R Open: a rebranded version of Revolution R Open.

Introduction¶

Importing data with rxImport function

# Source csv: located inside the directory reported by the 'sampleDataDir' option
myAirlineCsv <- file.path(rxGetOption('sampleDataDir'), '2007_subset.csv')

# Target xdf: a relative path, so it resolves against the working directory
myAirlineXdf <- '2007_subset.xdf'

# Convert the csv into xdf format. The plain call would be:
#   rxImport(inData = myAirlineCsv, outFile = myAirlineXdf, overwrite = TRUE)
# Wrapping it in system.time() additionally reports how long the import took.
system.time(
  rxImport(inData = myAirlineCsv, outFile = myAirlineXdf, overwrite = TRUE)
)

# List the files in the working directory
list.files()

Functions for summarizing data

# Basic metadata for the xdf file: variable information plus the first 10 rows
rxGetInfo(data = myAirlineXdf, getVarInfo = TRUE, numRows = 10)

# Summary statistics for actual elapsed time, time in the air,
# departure delay and flight distance
rxSummary(
  formula = ~ ActualElapsedTime + AirTime + DepDelay + Distance,
  data = myAirlineXdf
)

# Histogram of departure delays with the default axis settings
rxHistogram(formula = ~DepDelay, data = myAirlineXdf)

# Same histogram, zoomed in on the interesting area: x-axis limited to
# [-100, 400], 500 breaks and 10 ticks on the x-axis
rxHistogram(
  formula = ~DepDelay,
  data = myAirlineXdf,
  xNumTicks = 10,
  numBreaks = 500,
  xAxisMinMax = c(-100, 400)
)

Creating new variables using rxDataStep

# Calculate an additional variable: airspeed (distance traveled / time in the air).
# The result is appended as a new column to the same xdf file.
# NOTE: R is case-sensitive; the column is named `AirTime` (as listed in
# varsToKeep), so the transform must use that exact spelling.
rxDataStep(inData = myAirlineXdf, 
           outFile = myAirlineXdf, 
           varsToKeep = c('Distance', 'AirTime'),
           transforms = list(airSpeed = Distance / AirTime),
           append = 'cols',
           overwrite = TRUE)

# Variable information restricted to the airSpeed column
rxGetInfo(data = myAirlineXdf, varsToKeep = 'airSpeed', getVarInfo = TRUE)

# Summary statistics for airSpeed
rxSummary(~airSpeed, data = myAirlineXdf)

# Histogram of airSpeed with the default settings
rxHistogram(~airSpeed, data = myAirlineXdf)

# Same histogram with the x-axis limited through xAxisMinMax,
# using 1500 breaks and 10 ticks on the x-axis
rxHistogram(
  ~airSpeed,
  data = myAirlineXdf,
  xAxisMinMax = c(0, 12),
  numBreaks = 1500,
  xNumTicks = 10
)

Transforming variables using rxDataStep

# Convert airSpeed to miles per hour by multiplying by 60
# (presumably the stored value is in miles per minute), writing the
# result back into the same xdf file
rxDataStep(
  inData = myAirlineXdf,
  outFile = myAirlineXdf,
  varsToKeep = c('airSpeed'),
  transforms = list(airSpeed = airSpeed * 60),
  overwrite = TRUE
)

# Histogram of airSpeed after the conversion
rxHistogram(~airSpeed, data = myAirlineXdf)

Correlations

# Correlation matrix for departure delay, arrival delay and air speed,
# computed only on rows where airSpeed lies strictly between 50 and 800
rxCor(
  formula = ~ DepDelay + ArrDelay + airSpeed,
  rowSelection = (airSpeed > 50) & (airSpeed < 800),
  data = myAirlineXdf
)

Linear regression

# Linear model of airSpeed on departure delay, fitted only on rows
# where airSpeed lies strictly between 50 and 800
myLMobj <- rxLinMod(
  formula = airSpeed ~ DepDelay,
  rowSelection = (airSpeed > 50) & (airSpeed < 800),
  data = myAirlineXdf
)

# Print the model summary
summary(myLMobj)

Data Exploration¶

RevoScaleR options

# List the names of all options returned by rxOptions()
names(rxOptions())

# Look up the value of the 'sampleDataDir' option (the sample data directory)
rxGetOption('sampleDataDir')

# Inspect the current value of the 'reportProgress' option
rxGetOption('reportProgress')

# Set reportProgress to 0
# NOTE(review): presumably this silences progress output — confirm against the RevoScaleR docs
rxOptions(reportProgress = 0)

Import and explore Dow Jones data

# Path to the Dow Jones daily xdf file inside the sample data directory
djiXdf <- file.path(rxGetOption('sampleDataDir'), 'DJIAdaily.xdf')

# Print information about the dataset, including per-variable information
rxGetInfo(data = djiXdf, getVarInfo = TRUE)

Extracting meta data about a variable using rxGetVarInfo

# Collect the variable metadata for the dataset and list the variable names
djiVarInfo <- rxGetVarInfo(data = djiXdf)
names(djiVarInfo)

# Pull out the metadata entry for the Close variable
# (the outer parentheses print the value as it is assigned)
(closeVarInfo <- djiVarInfo[['Close']])

# Class of the extracted metadata object
class(closeVarInfo)

# Structure of the extracted metadata object
str(closeVarInfo)

# The 'high' element holds the global maximum of the Close variable
closeMax <- closeVarInfo[['high']]

Summarizing variables with rxSummary

# Unweighted summary statistics for day of week, closing value and volume
rxSummary(formula = ~ DayOfWeek + Close + Volume, data = djiXdf)

# Summary statistics using Volume as frequency weights
rxSummary(formula = ~ DayOfWeek + Close, fweights = 'Volume', data = djiXdf)

# Frequency count of observations per day of the week
rxCrossTabs(formula = ~ DayOfWeek, data = djiXdf)

Exploring a distribution with rxHistogram

# Histogram of a numeric variable
rxHistogram(formula = ~ Close, data = djiXdf)

# Histogram of a categorical variable
rxHistogram(formula = ~ DayOfWeek, data = djiXdf)

# One panel per day of the week
rxHistogram(formula = ~ Close | DayOfWeek, data = djiXdf)

# Numeric variable again, this time using Volume as frequency weights
rxHistogram(formula = ~ Close, fweights = 'Volume', data = djiXdf)

Plotting bivariate relationships with rxLinePlot

# Closing value against days since 1928 as a single line
rxLinePlot(formula = Close ~ DaysSince1928, data = djiXdf)

# One panel per day of the week
rxLinePlot(formula = Close ~ DaysSince1928 | DayOfWeek, data = djiXdf)

# One line per day of the week, drawn as groups in a single panel
rxLinePlot(formula = Close ~ DaysSince1928, data = djiXdf, groups = DayOfWeek)

# Single line again, with log() applied to the ordinate (y) variable
rxLinePlot(formula = log(Close) ~ DaysSince1928, data = djiXdf)

Summarizing variables with rxCrossTabs

# Summed volume for each day of the week
rxCrossTabs(formula = Volume ~ DayOfWeek, data = djiXdf)

# Summed volume for each day of the week within each month
rxCrossTabs(formula = Volume ~ F(Month):DayOfWeek, data = djiXdf)

# Average volume for each day of the week within each month
rxCrossTabs(formula = Volume ~ F(Month):DayOfWeek, means = TRUE, data = djiXdf)

# Average closing price for each day of the week within each month,
# using Volume as frequency weights
rxCrossTabs(
  formula = Close ~ F(Month):DayOfWeek,
  fweights = 'Volume',
  means = TRUE,
  data = djiXdf
)

Summarizing variables with rxCube

# Summed volume for each day of the week: rxCrossTabs first, then the
# same computation through rxCube with means switched off
rxCrossTabs(formula = Volume ~ DayOfWeek, data = djiXdf)

rxCube(formula = Volume ~ DayOfWeek, means = FALSE, data = djiXdf)

# Summed volume for each day of the week within each month, again with
# both functions
rxCrossTabs(formula = Volume ~ F(Month):DayOfWeek, data = djiXdf)

rxCube(formula = Volume ~ F(Month):DayOfWeek, means = FALSE, data = djiXdf)

# Average volume for each day of the week within each month
# (no `means` argument is passed here, unlike the summed versions above)
rxCube(formula = Volume ~ F(Month):DayOfWeek, data = djiXdf)

# Compute the average closing price for each day of the week for each month,
# using volume as frequency weights.
# The formula crosses F(Month) with DayOfWeek so the result is broken down by
# month as well as by day, matching the equivalent rxCrossTabs call.
rxCube(Close ~ F(Month):DayOfWeek, 
       data = djiXdf, 
       fweights = 'Volume')

Data Manipulation¶

Using rxDataStep to transform data

# Print information about mortData
rxGetInfo(data = mortData)

## Path for a personal copy of the data
myMortData <- 'myMD.xdf'

# Write the copy, adding a logical variable that flags rows whose
# credit card debt exceeds 8000
rxDataStep(
  inData = mortData,
  outFile = myMortData,
  overwrite = TRUE,
  transforms = list(highDebtRow = ccDebt > 8000)
)

# Variant of the call above without overwrite = TRUE, kept for reference:
#rxDataStep(inData = mortData, outFile = myMortData, transforms = list(highDebtRow = ccDebt > 8000))

# Variable information for the copy
rxGetVarInfo(data = myMortData)

# Summarize highDebtRow to get the proportion of values that are 1
rxSummary(formula = ~ highDebtRow, data = myMortData)

# Several transforms in one pass, appended as new columns to the same file:
#   newHouse - house age below 10
#   ccsXhd   - credit score multiplied by the high-debt flag
rxDataStep(
  inData = myMortData,
  outFile = myMortData,
  append = 'cols',
  overwrite = TRUE,
  transforms = list(
    newHouse = houseAge < 10,
    ccsXhd = creditScore * highDebtRow
  )
)

More complex transforms using transformFuncs

# Summary statistics for creditScore
# (the outer parentheses print the result as it is assigned)
(csSummary <- rxSummary(formula = ~ creditScore, data = mortData))

# First entry of the Mean and StdDev columns of the summary data frame
meanCS <- csSummary[['sDataFrame']][['Mean']][1]
sdCS <- csSummary[['sDataFrame']][['StdDev']][1]

# Transform function that adds a centered and scaled credit score.
#
# mylist: a list containing a `creditScore` element.
# Returns the same list with an extra `scaledCreditScore` element equal to
# (creditScore - myCenter) / myScale.
#
# `myCenter` and `myScale` are not parameters; they are looked up as free
# variables and must be in scope when the function runs.
scaleCS <- function(mylist) {
  centered <- mylist[['creditScore']] - myCenter
  mylist[['scaledCreditScore']] <- centered / myScale
  mylist
}

# Run the transform function with rxDataStep.
# transformFunc names the function to apply; transformObjects supplies the
# values (myCenter, myScale) that scaleCS reads as free variables.
# overwrite = TRUE is required because 'myMD.xdf' may already exist from an
# earlier step; without it rxDataStep will not replace an existing outFile.
myMortData <- 'myMD.xdf'
rxDataStep(inData = mortData, outFile = myMortData,
           transformFunc = scaleCS,
           transformObjects = list(myCenter = meanCS, myScale = sdCS),
           overwrite = TRUE)

# Check the new variable
rxGetVarInfo(myMortData)
rxSummary(~ scaledCreditScore, 
          data = myMortData)

Data Analysis¶

Preparing data for analysis: import

# Source csv inside the sample data directory, and the target xdf path
myAirlineCsv <- file.path(rxGetOption('sampleDataDir'), 'AirlineDemoSmall.csv')
myAirlineXdf <- 'ADS.xdf'

# Import the csv into xdf format. colInfo declares DayOfWeek as a factor
# with its levels listed explicitly in Monday-to-Sunday order.
rxImport(
  inData = myAirlineCsv,
  outFile = myAirlineXdf,
  colInfo = list(
    DayOfWeek = list(
      type = 'factor',
      levels = c('Monday', 'Tuesday', 'Wednesday',
                 'Thursday', 'Friday', 'Saturday', 'Sunday')
    )
  ),
  overwrite = TRUE
)

Preparing data for analysis: exploration

# Arrival delay summarized separately for each day of the week
rxSummary(formula = ArrDelay ~ DayOfWeek, data = myAirlineXdf)

# Histogram of arrival delay
rxHistogram(formula = ~ArrDelay, data = myAirlineXdf)

Construct a linear model

# Predict arrival delay by day of the week
myLM1 <- rxLinMod(ArrDelay ~ DayOfWeek, 
                  data = myAirlineXdf)

# Summarize the model
summary(myLM1)

# Use the transforms argument to create a factor variable associated with
# departure time 'on the fly': catDepTime cuts CRSDepTime at the breaks
# 5, 7, ..., 23.
# Predict arrival delay by the interaction between day of the week and that
# new factor, so the formula must reference catDepTime via DayOfWeek:catDepTime.
myLM2 <- rxLinMod(ArrDelay ~ DayOfWeek:catDepTime, 
                  data = myAirlineXdf,
                  transforms = list(
                    catDepTime = cut(CRSDepTime, breaks = seq(from = 5, to = 23, by = 2))),
                  cube = TRUE)

# Summarize the model
summary(myLM2)

Generating predictions and residuals

# Start from the model summary
summary(myLM2)

# Path of the new xdf file that will store the predictions
myNewADS <- 'myNEWADS.xdf'

# Write predictions, together with the model variables, to the new file
rxPredict(
  modelObject = myLM2,
  data = myAirlineXdf,
  writeModelVars = TRUE,
  outData = myNewADS
)

# Inspect the new dataset
rxGetInfo(data = myNewADS, getVarInfo = TRUE)

# Run the prediction again with residuals computed as well, overwriting
# the file written above
rxPredict(
  modelObject = myLM2,
  data = myAirlineXdf,
  computeResiduals = TRUE,
  writeModelVars = TRUE,
  outData = myNewADS,
  overwrite = TRUE
)

# Inspect the new dataset again
rxGetInfo(data = myNewADS, getVarInfo = TRUE)

Logistic regression

# Look at the objects in the workspace and the metadata of mortData
ls()
rxGetInfo(data = mortData, getVarInfo = TRUE)

# Construct the logit model: default as a function of house age, year
# (treated as a factor via F()), credit card debt, credit score and
# years of employment
logitModel <- rxLogit(formula = default ~ houseAge + F(year) + ccDebt + creditScore + yearsEmploy, 
                      data = mortData)

# Summarize the result contained in logitModel
summary(logitModel)

Individual mortgage information

# Model summary
summary(logitModel)

# First few rows of the data to predict on
head(newData)

# Predict with the logit model; the same object is passed as both the
# input data and the output target, and type = 'response' is requested
dataWithPredictions <- rxPredict(
  modelObject = logitModel,
  data = newData,
  type = 'response',
  outData = newData
)

# Print the predictions
dataWithPredictions

Computing k-means with rxKmeans

# Inspect the variables available in mortData
rxGetInfo(data = mortData, getVarInfo = TRUE)

# Path of the new xdf file that rxKmeans writes to
myNewMortData <- 'myMDwithKMeans.xdf'

# k-means with 4 clusters on credit card debt, credit score and house age,
# restricted to rows where year == 2000; model variables are written out too
KMout <- rxKmeans(
  formula = ~ ccDebt + creditScore + houseAge,
  data = mortData,
  rowSelection = year == 2000,
  numClusters = 4,
  writeModelVars = TRUE,
  outFile = myNewMortData
)

print(KMout)

# Variables present in the new dataset
rxGetInfo(data = myNewMortData, getVarInfo = TRUE)

# Summary of the cluster assignment, treated as a factor
rxSummary(formula = ~ F(.rxCluster), data = myNewMortData)

# Read roughly 10% of the rows into memory: each row draws a random integer
# from 1 to 10 and only rows that drew 1 are kept; `year` is dropped
mydf <- rxXdfToDataFrame(
  myNewMortData,
  transforms = list(randSamp = sample(10, size = .rxNumRows, replace = TRUE)),
  rowSelection = randSamp == 1,
  varsToDrop = 'year'
)

## Plot every column except the first, colored by cluster
plot(mydf[-1], col = mydf$.rxCluster)

Create some decision trees

# Regression tree for default, fitted on rows where year == 2000 with a
# maximum depth of 5
regTreeOut <- rxDTree(
  formula = default ~ creditScore + ccDebt + yearsEmploy + houseAge,
  data = mortData,
  rowSelection = year == 2000,
  maxdepth = 5
)

# Print the fitted tree
print(regTreeOut)

# Plot a dendrogram, then add the node labels on top of it
plot(rxAddInheritance(regTreeOut))
text(rxAddInheritance(regTreeOut))

# Alternative visualization (left disabled):
#library(RevoTreeView)
#createTreeView(regTreeOut)

# Path of the xdf file that will receive the predicted values
myNewData <- 'myNewMortData.xdf'

# Write predictions for all of mortData, together with the model variables,
# storing the predicted value in a column named 'default_RegPred'
rxPredict(
  regTreeOut,
  data = mortData,
  predVarNames = 'default_RegPred',
  writeModelVars = TRUE,
  outData = myNewData
)

# ROC curve comparing the actual 'default' values with the predictions
rxRocCurve(
  actualVarName = 'default',
  predVarNames = 'default_RegPred',
  data = myNewData
)