Classification Models

Libraries

Code

library(tidyverse)
library(skimr)
library(gtExtras)
library(tidymodels)
library(future)

Import/Load Data

Code

birdclef_2026train <- read_csv("C:/Users/foma/Downloads/train.csv")

Get summary : glimpse & skim

Code

birdclef_2026train %>% 
  glimpse() %>% 
  skim()

Rows: 35,549
Columns: 15
$ primary_label    <chr> "1161364", "1161364", "1161364", "1161364", "1161364"…
$ secondary_labels <chr> "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]",…
$ type             <chr> "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]",…
$ latitude         <dbl> -22.7562, -22.7558, -22.7547, -22.7547, -22.7426, -22…
$ longitude        <dbl> -46.8666, -46.8700, -46.8728, -46.8728, -46.8985, -47…
$ scientific_name  <chr> "Guyalna cuta", "Guyalna cuta", "Guyalna cuta", "Guya…
$ common_name      <chr> "Guyalna cuta", "Guyalna cuta", "Guyalna cuta", "Guya…
$ class_name       <chr> "Insecta", "Insecta", "Insecta", "Insecta", "Insecta"…
$ inat_taxon_id    <dbl> 1161364, 1161364, 1161364, 1161364, 1161364, 1161364,…
$ author           <chr> "Lucas Barbosa", "Lucas Barbosa", "Lucas Barbosa", "L…
$ license          <chr> "cc-by-nc", "cc-by-nc", "cc-by-nc", "cc-by-nc", "cc-b…
$ rating           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ url              <chr> "https://static.inaturalist.org/sounds/1216197.mp3?17…
$ filename         <chr> "1161364/iNat1216197.ogg", "1161364/iNat1114648.ogg",…
$ collection       <chr> "iNat", "iNat", "iNat", "iNat", "iNat", "iNat", "iNat…

Data summary
Name	Piped data
Number of rows	35549
Number of columns	15
_______________________
Column type frequency:
character	11
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
primary_label	1	5	7	206
secondary_labels	1	2	161	1517
type	1	2	100	755
scientific_name	1	10	31	206
common_name	1	6	32	206
class_name	1	4	8	5
author	1	1	90	4010
license	1	3	11	7
url	1	33	61	35549
filename	1	15	23	35549
collection	1	2	4	2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
latitude	1	-8.17	20.25	-54.86	-23.36	-14.88	4.64	69.58	▂▇▃▂▁
longitude	1	-60.74	25.43	-159.66	-75.14	-58.13	-48.73	175.32	▁▇▁▁▁
inat_taxon_id	1	80221.32	242247.61	7.00	8830.00	15957.00	19627.00	1595929.00	▇▁▁▁▁
rating	1	2.60	2.07	0.00	0.00	3.50	4.50	5.00	▇▁▂▆▆

Get the bird names and arrange in descending order

Code

birdclef_2026train %>% 
  count(scientific_name, class_name, sort = TRUE )

# A tibble: 206 × 3
   scientific_name          class_name     n
   <chr>                    <chr>      <int>
 1 Turdus rufiventris       Aves         499
 2 Coereba flaveola         Aves         498
 3 Glaucidium brasilianum   Aves         497
 4 Vanellus chilensis       Aves         497
 5 Passer domesticus        Aves         496
 6 Micrastur semitorquatus  Aves         495
 7 Pandion haliaetus        Aves         495
 8 Myiozetetes similis      Aves         494
 9 Nyctidromus albicollis   Aves         493
10 Tolmomyias sulphurescens Aves         493
# ℹ 196 more rows

[The most populated bird is Turdus rufiventris, find image below]

[Image download site](https://www.magnific.com/search?format=search&last_filter=page&last_value=1&page=1&query=Turdus+rufiventris)

Model the specific sound type (song, call, flight call, etc.) as a multi-class classification problem

Code

#note on creating != : shift+! then press =

xc_data <- birdclef_2026train |>
  filter(collection == "XC", type != "[]")

Count raw type values

Code

xc_data |>
  count(type, sort = TRUE)

# A tibble: 754 × 2
   type                         n
   <chr>                    <int>
 1 ['song']                 10376
 2 ['call']                  6205
 3 ['call', ' song']         1458
 4 ['flight call']            711
 5 ['alarm call']             423
 6 ['dawn song']              216
 7 ['uncertain']              191
 8 ['canto']                  171
 9 ['duet', ' song']          171
10 ['call', ' flight call']   162
# ℹ 744 more rows

Count total distinct type combination(754 distinct combinations — many are multi-label)

Code

xc_data |> count(type) |> nrow()

[1] 754

Extract first type by stripping list formatting and taking first element

Code

xc_clean <- xc_data |>
  mutate(
    type_primary = type |>
      str_remove_all("\\[|\\]|'") |>   # remove [ ] '
      str_split(",") |>
      map_chr(\(x) str_trim(x[[1]]))    # take first element, trim whitespace
  )

Count the cleaned type_primary value

Code

xc_clean |>
  count(type_primary, sort = TRUE)

# A tibble: 127 × 2
   type_primary     n
   <chr>        <int>
 1 song         10692
 2 call          8715
 3 flight call    868
 4 alarm call     749
 5 duet           302
 6 dawn song      249
 7 uncertain      194
 8 canto          171
 9 begging call   123
10 drumming        71
# ℹ 117 more rows

Count the cleaned distinct type_primary combination(127 distinct first element)

Code

xc_clean |>
  count(type_primary, sort = TRUE) |>
  nrow()

[1] 127

Build cleaned multi-class outcome with lumped categories

Code

ambiguous <- c("uncertain", "?", "song?", "call?")

xc_model <- xc_clean |>
  filter(!type_primary %in% ambiguous) |>
  mutate(
    sound_type = fct_lump_min(type_primary, min = 50, other_level = "other") |>
      fct_infreq()
  ) |>
  select(sound_type, class_name, rating, latitude, longitude, license)

Split XC data and create cross-validation folds

Code

set.seed(7304)
xc_split  <- initial_split(xc_model, strata = sound_type)
xc_tr     <- training(xc_split)
xc_te     <- testing(xc_split)
xc_folds  <- vfold_cv(xc_tr, v = 5, strata = sound_type)

Define interested metrics

Code

xc_metrics <- metric_set(roc_auc, accuracy, brier_class)

Define recipes

Code

rf_rec <- recipe(sound_type ~ ., data = xc_tr) |>
  step_novel(all_nominal_predictors()) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors())

Tune the random forest(mtry and min_n with over 20 point grid) to optimize ROC_AUC

Code

rf_tune_spec <- rand_forest(
  mode  = "classification",
  mtry  = tune(),
  min_n = tune()
) |>
  set_engine("ranger")

Code

rf_tune_wf <- workflow() |>
  add_recipe(rf_rec) |>
  add_model(rf_tune_spec)

Code

set.seed(2951)
rf_tune_res <- tune_grid(
  rf_tune_wf,
  resamples = xc_folds,
  metrics   = xc_metrics,
  grid      = 20
)

Code

show_best(rf_tune_res, metric = "roc_auc")

# A tibble: 5 × 8
   mtry min_n .metric .estimator  mean     n std_err .config         
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
1     4     8 roc_auc hand_till  0.708     5 0.00411 pre0_mod12_post0
2     3     2 roc_auc hand_till  0.706     5 0.00466 pre0_mod09_post0
3     4    40 roc_auc hand_till  0.705     5 0.00558 pre0_mod14_post0
4     5    36 roc_auc hand_till  0.705     5 0.00564 pre0_mod17_post0
5     3    22 roc_auc hand_till  0.704     5 0.00561 pre0_mod10_post0

Visualize tuning result

Code

autoplot(rf_tune_res, metric = "roc_auc")

Extract all result for the best tune random forest configuration

Code

best_rf_params <- select_best(rf_tune_res, metric = "roc_auc")

Get all metrics for the best configuration

Code

rf_tune_res |>
  collect_metrics() |>
  inner_join(best_rf_params |> select(mtry, min_n, .config),
             by = c("mtry", "min_n", ".config"))

# A tibble: 3 × 8
   mtry min_n .metric     .estimator  mean     n std_err .config         
  <int> <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>           
1     4     8 accuracy    multiclass 0.537     5 0.00332 pre0_mod12_post0
2     4     8 brier_class multiclass 0.302     5 0.00140 pre0_mod12_post0
3     4     8 roc_auc     hand_till  0.708     5 0.00411 pre0_mod12_post0

Fit final tuned random forest on full training set and evaluate on test

Code

final_rf_wf <- finalize_workflow(rf_tune_wf, best_rf_params)

final_fit <- last_fit(final_rf_wf, split = xc_split, metrics = xc_metrics)
collect_metrics(final_fit)

# A tibble: 3 × 4
  .metric     .estimator .estimate .config        
  <chr>       <chr>          <dbl> <chr>          
1 accuracy    multiclass     0.532 pre0_mod0_post0
2 roc_auc     hand_till      0.715 pre0_mod0_post0
3 brier_class multiclass     0.303 pre0_mod0_post0

Plot confusion matrix

Code

collect_predictions(final_fit) |>
  conf_mat(truth = sound_type, estimate = .pred_class) |>
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix — Tuned Random Forest (Test Set)")