Build a pipeline

Load libraries

library(autotextclassifier) # Auto text classifier 
library(parallel) # Parallel processing 
library(doParallel) # Parallel processing 
library(here) # Creating reproducible file paths 
library(patchwork) # Putting ggplots together
library(recipes) # Preprocessing 
library(zeallot) # Multiple assignments 
library(yardstick) # Metrics

Import data

# Build the path with here() and load the .rda file into the current
# environment; the code that follows expects it to provide `sample_data`.
sample_data_path <- here("inst/extdata/sample_data.rda")
load(file = sample_data_path)

Data munging

Make sure that the outcome variable is a factor.

# Give the columns the names used throughout the rest of the workflow.
sample_data <- setNames(
  sample_data,
  c("category", "org_name", "ein", "text")
)

# The outcome column must be a factor for classification.
sample_data[["category"]] <- as.factor(sample_data[["category"]])

Apply basic recipe

The rec object provides the following information. The apply_basic_recipe function also checks whether the text column has missing values or includes extremely short documents (fewer than five words).

There are two basic options for text preprocessing.

Without word embedding

Tokenization for text [trained]
Stop word removal for text [trained]
Text filtering for text [trained]
Term frequency-inverse document frequency with text [trained]

With word embedding

Tokenization for text [trained]
Stop word removal for text [trained]
Text filtering for text [trained]
Word embeddings aggregated from text [trained]

# Recipe variant 1 (no word embedding): tokenize, remove stop words, filter
# tokens, then weight terms by TF-IDF.
rec <- apply_basic_recipe(
  sample_data,
  category ~ text,
  text
)

# Recipe variant 2 (word embedding): same tokenization, stop word removal and
# filtering, but word embeddings are aggregated instead of computing TF-IDF.
rec_alt <- apply_basic_recipe(
  sample_data,
  category ~ text,
  text,
  add_embedding = TRUE
)

The build_pipeline function reduces the number of steps one needs to take to build a classifier pipeline. The pipeline involves splitting the data, creating tuning parameters, search spaces, workflows, and 10-fold cross-validation samples, finding the best model from each algorithm, and fitting the best model from each algorithm to the data.

# Using parallel processing to speed up

# Leave one physical core free for the rest of the system, but always start at
# least one worker: detectCores() can return 1 or NA, and makeCluster() cannot
# create a cluster with zero (or NA) workers.
all_cores <- parallel::detectCores(logical = FALSE)
n_workers <- if (is.na(all_cores)) 1L else max(1L, all_cores - 1L)

cl <- makeCluster(n_workers)

registerDoParallel(cl)

# Fix the RNG seed so the data split and resampling are reproducible.
set.seed(1234)

# Pass the data as the first argument (the pipeline starts by splitting it),
# the outcome column second and the recipe third. The three returned fits are
# unpacked into separate variables by zeallot's %<-%.
c(lasso_fit, rand_fit, xg_fit) %<-% build_pipeline(
  sample_data,
  category,
  rec,
  prop_ratio = 0.8,
  metric_choice = "roc_auc"
)

# Release the worker processes once the pipeline has finished, and switch back
# to the sequential backend so later foreach-based code does not try to use
# the stopped cluster.
stopCluster(cl)
foreach::registerDoSEQ()

Evaluate the model using visualization

# Class-based metrics: build one plot per fitted model, then place them side
# by side with `+` (patchwork is loaded for composing ggplots).
# NOTE(review): test_x_class and test_y_class are not defined in the code
# shown here -- confirm where they are created before running this chunk.
lasso_class_plot <- viz_class_fit(
  lasso_fit, "Lasso", test_x_class, test_y_class, "class"
)
rand_class_plot <- viz_class_fit(
  rand_fit, "Random forest", test_x_class, test_y_class, "class"
)
xg_class_plot <- viz_class_fit(
  xg_fit, "XGBoost", test_x_class, test_y_class, "class"
)

lasso_class_plot + rand_class_plot + xg_class_plot

# Probability-based metrics: build one plot per fitted model, then place them
# side by side with `+` (patchwork is loaded for composing ggplots).
# NOTE(review): test_x_class and test_y_class are not defined in the code
# shown here -- confirm where they are created before running this chunk.
lasso_prob_plot <- viz_class_fit(
  lasso_fit, "Lasso", test_x_class, test_y_class, "probability"
)
rand_prob_plot <- viz_class_fit(
  rand_fit, "Random forest", test_x_class, test_y_class, "probability"
)
xg_prob_plot <- viz_class_fit(
  xg_fit, "XGBoost", test_x_class, test_y_class, "probability"
)

lasso_prob_plot + rand_prob_plot + xg_prob_plot