Code
# Attach the packages used in this analysis.
library(tidyverse)
library(skimr)
library(gtExtras)
library(tidymodels)
library(future)

# Read the training metadata CSV into a tibble.
# NOTE(review): absolute, machine-specific path; adjust it to wherever
# train.csv is stored before re-running.
birdclef_2026train <- read_csv("C:/Users/foma/Downloads/train.csv")

# Print a compact column preview, then a per-variable summary.
# glimpse() returns its input invisibly, so it can be piped on to skim().
birdclef_2026train %>%
  glimpse() %>%
  skim()
Rows: 35,549
Columns: 15
$ primary_label <chr> "1161364", "1161364", "1161364", "1161364", "1161364"…
$ secondary_labels <chr> "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]",…
$ type <chr> "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]",…
$ latitude <dbl> -22.7562, -22.7558, -22.7547, -22.7547, -22.7426, -22…
$ longitude <dbl> -46.8666, -46.8700, -46.8728, -46.8728, -46.8985, -47…
$ scientific_name <chr> "Guyalna cuta", "Guyalna cuta", "Guyalna cuta", "Guya…
$ common_name <chr> "Guyalna cuta", "Guyalna cuta", "Guyalna cuta", "Guya…
$ class_name <chr> "Insecta", "Insecta", "Insecta", "Insecta", "Insecta"…
$ inat_taxon_id <dbl> 1161364, 1161364, 1161364, 1161364, 1161364, 1161364,…
$ author <chr> "Lucas Barbosa", "Lucas Barbosa", "Lucas Barbosa", "L…
$ license <chr> "cc-by-nc", "cc-by-nc", "cc-by-nc", "cc-by-nc", "cc-b…
$ rating <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ url <chr> "https://static.inaturalist.org/sounds/1216197.mp3?17…
$ filename <chr> "1161364/iNat1216197.ogg", "1161364/iNat1114648.ogg",…
$ collection <chr> "iNat", "iNat", "iNat", "iNat", "iNat", "iNat", "iNat…
| Name | Piped data |
| Number of rows | 35549 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| primary_label | 0 | 1 | 5 | 7 | 0 | 206 | 0 |
| secondary_labels | 0 | 1 | 2 | 161 | 0 | 1517 | 0 |
| type | 0 | 1 | 2 | 100 | 0 | 755 | 0 |
| scientific_name | 0 | 1 | 10 | 31 | 0 | 206 | 0 |
| common_name | 0 | 1 | 6 | 32 | 0 | 206 | 0 |
| class_name | 0 | 1 | 4 | 8 | 0 | 5 | 0 |
| author | 0 | 1 | 1 | 90 | 0 | 4010 | 0 |
| license | 0 | 1 | 3 | 11 | 0 | 7 | 0 |
| url | 0 | 1 | 33 | 61 | 0 | 35549 | 0 |
| filename | 0 | 1 | 15 | 23 | 0 | 35549 | 0 |
| collection | 0 | 1 | 2 | 4 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| latitude | 0 | 1 | -8.17 | 20.25 | -54.86 | -23.36 | -14.88 | 4.64 | 69.58 | ▂▇▃▂▁ |
| longitude | 0 | 1 | -60.74 | 25.43 | -159.66 | -75.14 | -58.13 | -48.73 | 175.32 | ▁▇▁▁▁ |
| inat_taxon_id | 0 | 1 | 80221.32 | 242247.61 | 7.00 | 8830.00 | 15957.00 | 19627.00 | 1595929.00 | ▇▁▁▁▁ |
| rating | 0 | 1 | 2.60 | 2.07 | 0.00 | 0.00 | 3.50 | 4.50 | 5.00 | ▇▁▂▆▆ |
# Tally rows per (scientific_name, class_name) pair, largest count first.
count(birdclef_2026train, scientific_name, class_name, sort = TRUE)
# A tibble: 206 × 3
scientific_name class_name n
<chr> <chr> <int>
1 Turdus rufiventris Aves 499
2 Coereba flaveola Aves 498
3 Glaucidium brasilianum Aves 497
4 Vanellus chilensis Aves 497
5 Passer domesticus Aves 496
6 Micrastur semitorquatus Aves 495
7 Pandion haliaetus Aves 495
8 Myiozetetes similis Aves 494
9 Nyctidromus albicollis Aves 493
10 Tolmomyias sulphurescens Aves 493
# ℹ 196 more rows
[The most frequently recorded bird is Turdus rufiventris; see the image below.]