library(autotextclassifier) # Auto text classifier
library(parallel) # Parallel processing
library(doParallel) # Parallel processing
library(here) # Creating reproducible file paths
library(patchwork) # Putting ggplots together
library(recipes) # Preprocessing
library(zeallot) # Multiple assignments
library(yardstick) # Metrics
The `rec` object provides the following information. The
`apply_basic_recipe()` function also checks whether the `text`
column has missing values or includes extremely short documents
(fewer than five words).
There are two basic options for text preprocessing: without or with word embedding.
# Recipe built without word embedding
rec <- apply_basic_recipe(
  sample_data,
  category ~ text,
  text
)
# Alternative recipe: same inputs, with word embedding switched on
rec_alt <- apply_basic_recipe(
  sample_data,
  category ~ text,
  text,
  add_embedding = TRUE
)
The `build_pipeline()` function reduces the number of steps needed
to build a classifier pipeline. The pipeline involves data splitting,
creating tuning parameters, search spaces, workflows, and 10-fold
cross-validation samples, finding the best model from each algorithm, and
fitting the best model from each algorithm to the data.
# Using parallel processing to speed up the pipeline.
# detectCores() can return NA (count unknown) or 1; `all_cores[1] - 1` would
# then be NA or 0 and makeCluster() would fail, so always keep at least one
# worker.
all_cores <- parallel::detectCores(logical = FALSE)
n_workers <- if (is.na(all_cores[1])) 1L else max(1L, all_cores[1] - 1L)
cl <- makeCluster(n_workers)
registerDoParallel(cl)
# Seed the RNG right before the pipeline call.
set.seed(1234)
# The result of build_pipeline() is unpacked by %<-% into three fits, one per
# algorithm (lasso, random forest, XGBoost).
# NOTE(review): `rec` is passed as both the first and the third argument;
# confirm against build_pipeline()'s signature that the first argument should
# not be the input data instead.
c(lasso_fit, rand_fit, xg_fit) %<-% build_pipeline(
  rec, category, rec,
  prop_ratio = 0.8, metric_choice = "roc_auc"
)
# Release the worker processes once the pipeline has finished, and fall back
# to the sequential backend so later foreach-based code does not try to use
# the stopped cluster.
stopCluster(cl)
foreach::registerDoSEQ()
# NOTE(review): test_x_class / test_y_class are not defined in this excerpt;
# presumably they are created elsewhere in the file - confirm.
# Class-based metrics: one panel per fitted model, folded together left to
# right with `+` (same composition as writing p1 + p2 + p3).
Reduce(`+`, list(
  viz_class_fit(lasso_fit, "Lasso", test_x_class, test_y_class, "class"),
  viz_class_fit(rand_fit, "Random forest", test_x_class, test_y_class, "class"),
  viz_class_fit(xg_fit, "XGBoost", test_x_class, test_y_class, "class")
))
# Probability-based metrics: same three models, same layout.
Reduce(`+`, list(
  viz_class_fit(lasso_fit, "Lasso", test_x_class, test_y_class, "probability"),
  viz_class_fit(rand_fit, "Random forest", test_x_class, test_y_class, "probability"),
  viz_class_fit(xg_fit, "XGBoost", test_x_class, test_y_class, "probability")
))