Classification Models

Libraries

Code
library(tidyverse)
library(skimr)
library(gtExtras)
library(tidymodels)
library(future)

Import/Load Data

Code
birdclef_2026train <- read_csv("C:/Users/foma/Downloads/train.csv")

Get summary : glimpse & skim

Code
birdclef_2026train %>% 
  glimpse() %>% 
  skim()
Rows: 35,549
Columns: 15
$ primary_label    <chr> "1161364", "1161364", "1161364", "1161364", "1161364"…
$ secondary_labels <chr> "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]",…
$ type             <chr> "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]",…
$ latitude         <dbl> -22.7562, -22.7558, -22.7547, -22.7547, -22.7426, -22…
$ longitude        <dbl> -46.8666, -46.8700, -46.8728, -46.8728, -46.8985, -47…
$ scientific_name  <chr> "Guyalna cuta", "Guyalna cuta", "Guyalna cuta", "Guya…
$ common_name      <chr> "Guyalna cuta", "Guyalna cuta", "Guyalna cuta", "Guya…
$ class_name       <chr> "Insecta", "Insecta", "Insecta", "Insecta", "Insecta"…
$ inat_taxon_id    <dbl> 1161364, 1161364, 1161364, 1161364, 1161364, 1161364,…
$ author           <chr> "Lucas Barbosa", "Lucas Barbosa", "Lucas Barbosa", "L…
$ license          <chr> "cc-by-nc", "cc-by-nc", "cc-by-nc", "cc-by-nc", "cc-b…
$ rating           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ url              <chr> "https://static.inaturalist.org/sounds/1216197.mp3?17…
$ filename         <chr> "1161364/iNat1216197.ogg", "1161364/iNat1114648.ogg",…
$ collection       <chr> "iNat", "iNat", "iNat", "iNat", "iNat", "iNat", "iNat…
Data summary
Name Piped data
Number of rows 35549
Number of columns 15
_______________________
Column type frequency:
character 11
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
primary_label 0 1 5 7 0 206 0
secondary_labels 0 1 2 161 0 1517 0
type 0 1 2 100 0 755 0
scientific_name 0 1 10 31 0 206 0
common_name 0 1 6 32 0 206 0
class_name 0 1 4 8 0 5 0
author 0 1 1 90 0 4010 0
license 0 1 3 11 0 7 0
url 0 1 33 61 0 35549 0
filename 0 1 15 23 0 35549 0
collection 0 1 2 4 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
latitude 0 1 -8.17 20.25 -54.86 -23.36 -14.88 4.64 69.58 ▂▇▃▂▁
longitude 0 1 -60.74 25.43 -159.66 -75.14 -58.13 -48.73 175.32 ▁▇▁▁▁
inat_taxon_id 0 1 80221.32 242247.61 7.00 8830.00 15957.00 19627.00 1595929.00 ▇▁▁▁▁
rating 0 1 2.60 2.07 0.00 0.00 3.50 4.50 5.00 ▇▁▂▆▆

Get the bird names and arrange in descending order

Code
birdclef_2026train %>% 
  count(scientific_name, class_name, sort = TRUE )
# A tibble: 206 × 3
   scientific_name          class_name     n
   <chr>                    <chr>      <int>
 1 Turdus rufiventris       Aves         499
 2 Coereba flaveola         Aves         498
 3 Glaucidium brasilianum   Aves         497
 4 Vanellus chilensis       Aves         497
 5 Passer domesticus        Aves         496
 6 Micrastur semitorquatus  Aves         495
 7 Pandion haliaetus        Aves         495
 8 Myiozetetes similis      Aves         494
 9 Nyctidromus albicollis   Aves         493
10 Tolmomyias sulphurescens Aves         493
# ℹ 196 more rows

[The most populated bird is Turdus rufiventris, find image below]

[Image download site](https://www.magnific.com/search?format=search&last_filter=page&last_value=1&page=1&query=Turdus+rufiventris)

Model the specific sound type (song, call, flight call, etc.) as a multi-class classification problem

Code
#note on creating != : shift+! then press =

xc_data <- birdclef_2026train |>
  filter(collection == "XC", type != "[]")

Count raw type values

Code
xc_data |>
  count(type, sort = TRUE)
# A tibble: 754 × 2
   type                         n
   <chr>                    <int>
 1 ['song']                 10376
 2 ['call']                  6205
 3 ['call', ' song']         1458
 4 ['flight call']            711
 5 ['alarm call']             423
 6 ['dawn song']              216
 7 ['uncertain']              191
 8 ['canto']                  171
 9 ['duet', ' song']          171
10 ['call', ' flight call']   162
# ℹ 744 more rows

Count total distinct type combination(754 distinct combinations — many are multi-label)

Code
xc_data |> count(type) |> nrow()
[1] 754

Extract first type by stripping list formatting and taking first element

Code
xc_clean <- xc_data |>
  mutate(
    type_primary = type |>
      str_remove_all("\\[|\\]|'") |>   # remove [ ] '
      str_split(",") |>
      map_chr(\(x) str_trim(x[[1]]))    # take first element, trim whitespace
  )

Count the cleaned type_primary value

Code
xc_clean |>
  count(type_primary, sort = TRUE)
# A tibble: 127 × 2
   type_primary     n
   <chr>        <int>
 1 song         10692
 2 call          8715
 3 flight call    868
 4 alarm call     749
 5 duet           302
 6 dawn song      249
 7 uncertain      194
 8 canto          171
 9 begging call   123
10 drumming        71
# ℹ 117 more rows

Count the cleaned distinct type_primary combination(127 distinct first element)

Code
xc_clean |>
  count(type_primary, sort = TRUE) |>
  nrow()
[1] 127

Build cleaned multi-class outcome with lumped categories

Code
ambiguous <- c("uncertain", "?", "song?", "call?")

xc_model <- xc_clean |>
  filter(!type_primary %in% ambiguous) |>
  mutate(
    sound_type = fct_lump_min(type_primary, min = 50, other_level = "other") |>
      fct_infreq()
  ) |>
  select(sound_type, class_name, rating, latitude, longitude, license)

Split XC data and create cross-validation folds

Code
set.seed(7304)
xc_split  <- initial_split(xc_model, strata = sound_type)
xc_tr     <- training(xc_split)
xc_te     <- testing(xc_split)
xc_folds  <- vfold_cv(xc_tr, v = 5, strata = sound_type)

Define interested metrics

Code
xc_metrics <- metric_set(roc_auc, accuracy, brier_class)

Define recipes

Code
rf_rec <- recipe(sound_type ~ ., data = xc_tr) |>
  step_novel(all_nominal_predictors()) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors())

Tune the random forest(mtry and min_n with over 20 point grid) to optimize ROC_AUC

Code
rf_tune_spec <- rand_forest(
  mode  = "classification",
  mtry  = tune(),
  min_n = tune()
) |>
  set_engine("ranger")
Code
rf_tune_wf <- workflow() |>
  add_recipe(rf_rec) |>
  add_model(rf_tune_spec)
Code
set.seed(2951)
rf_tune_res <- tune_grid(
  rf_tune_wf,
  resamples = xc_folds,
  metrics   = xc_metrics,
  grid      = 20
)
Code
show_best(rf_tune_res, metric = "roc_auc")
# A tibble: 5 × 8
   mtry min_n .metric .estimator  mean     n std_err .config         
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
1     4     8 roc_auc hand_till  0.708     5 0.00411 pre0_mod12_post0
2     3     2 roc_auc hand_till  0.706     5 0.00466 pre0_mod09_post0
3     4    40 roc_auc hand_till  0.705     5 0.00558 pre0_mod14_post0
4     5    36 roc_auc hand_till  0.705     5 0.00564 pre0_mod17_post0
5     3    22 roc_auc hand_till  0.704     5 0.00561 pre0_mod10_post0

Visualize tuning result

Code
autoplot(rf_tune_res, metric = "roc_auc")

Extract all result for the best tune random forest configuration

Code
best_rf_params <- select_best(rf_tune_res, metric = "roc_auc")

Get all metrics for the best configuration

Code
rf_tune_res |>
  collect_metrics() |>
  inner_join(best_rf_params |> select(mtry, min_n, .config),
             by = c("mtry", "min_n", ".config"))
# A tibble: 3 × 8
   mtry min_n .metric     .estimator  mean     n std_err .config         
  <int> <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>           
1     4     8 accuracy    multiclass 0.537     5 0.00332 pre0_mod12_post0
2     4     8 brier_class multiclass 0.302     5 0.00140 pre0_mod12_post0
3     4     8 roc_auc     hand_till  0.708     5 0.00411 pre0_mod12_post0

Fit final tuned random forest on full training set and evaluate on test

Code
final_rf_wf <- finalize_workflow(rf_tune_wf, best_rf_params)

final_fit <- last_fit(final_rf_wf, split = xc_split, metrics = xc_metrics)
collect_metrics(final_fit)
# A tibble: 3 × 4
  .metric     .estimator .estimate .config        
  <chr>       <chr>          <dbl> <chr>          
1 accuracy    multiclass     0.532 pre0_mod0_post0
2 roc_auc     hand_till      0.715 pre0_mod0_post0
3 brier_class multiclass     0.303 pre0_mod0_post0

Plot confusion matrix

Code
collect_predictions(final_fit) |>
  conf_mat(truth = sound_type, estimate = .pred_class) |>
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix — Tuned Random Forest (Test Set)")