For the capstone project of my data analytics certificate, I decided to analyze trends in bee populations. Unfortunately, I’m not a biologist, and I don’t know how to extrapolate species abundance from occurrence data, not to mention a bunch of other factors that I’m definitely overlooking, so my part of the project was cut short. Regardless, I’m putting it here in case actual scientists find it; maybe this work will not be useless lol.
I downloaded the data from the Global Biodiversity Information Facility (GBIF). A citation with the DOI can be found at the bottom of this page.
Load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rgbif)
library(DBI)
library(RPostgres)
library(maps)
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
Dataframe
# Load the occurrences export, then coerce gbifid_ to numeric and date_ to Date.
occurrencesDataframe <- mutate(
  read_csv(
    "~/Documents/projects/pages/files/occurrencesDataframe.csv"
  ),
  gbifid_ = as.numeric(gbifid_),
  date_ = as_date(date_)
)
## Rows: 2287581 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): family_, scientificname_, state_, county_
## dbl (4): gbifid_, count_, latitude_, longitude_
## date (1): date_
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the column types and the first few values of the loaded data.
str(occurrencesDataframe)
## tibble [2,287,581 × 9] (S3: tbl_df/tbl/data.frame)
## $ gbifid_ : num [1:2287581] 6.58e+08 1.46e+09 3.10e+09 3.10e+09 1.46e+09 ...
## $ family_ : chr [1:2287581] "Apidae" "Halictidae" "Megachilidae" "Andrenidae" ...
## $ scientificname_: chr [1:2287581] "Bombus rufocinctus Cresson, 1863" "Halictidae" "Megachilidae" "Andrenidae" ...
## $ count_ : num [1:2287581] 1 1 1 1 1 1 1 1 1 1 ...
## $ state_ : chr [1:2287581] "New Mexico" "Utah" "Utah" "Utah" ...
## $ county_ : chr [1:2287581] "Lincoln" "Salt Lake" "Salt Lake" "Salt Lake" ...
## $ latitude_ : num [1:2287581] 33.6 40.8 40.7 40.7 40.8 ...
## $ longitude_ : num [1:2287581] -106 -112 -112 -112 -112 ...
## $ date_ : Date[1:2287581], format: "1923-01-01" "1923-01-01" ...
Replace missing (NA) values in the count_ field with 1, and floor date_ to the first day of its month
# Fill missing count_ values with 1 and round every date_ down to the start
# of its month.
occurrencesDataframe <- mutate(
  occurrencesDataframe,
  count_ = replace_na(count_, 1),
  date_ = floor_date(date_, unit = "month")
)
Get locality_ (state and county) from coordinates (latitude_ and longitude_), and title-case state_ and county_
# Keep only rows where BOTH coordinates are present. Each column is tested
# with is.na() on its own: `latitude_ | longitude_` would coerce the numbers
# to logicals, and `NA | TRUE` is TRUE, so a row with one missing coordinate
# would not be dropped.
occurrencesDataframe <- occurrencesDataframe %>%
  filter(!is.na(latitude_), !is.na(longitude_))
# Look up the region of the maps "county" database that contains each point
# (x = longitude, y = latitude).
occurrencesDataframe$locality_ <- map.where(
  database = "county",
  occurrencesDataframe$longitude_,
  occurrencesDataframe$latitude_
)
# Disabled: split locality_ into state_ and county_.
# occurrencesDataframe <- occurrencesDataframe %>%
# relocate(locality_, .after = count_) %>%
# separate_wider_delim(locality_, delim = ",", names = c("state_", "county_"))
# Normalise the capitalisation of the state_ and county_ values.
occurrencesDataframe$state_ <- str_to_title(occurrencesDataframe$state_)
occurrencesDataframe$county_ <- str_to_title(occurrencesDataframe$county_)
Establish occurrences location
# Disabled interactive alternative: prompt for the location instead.
#locations <- str_to_title(readline(prompt = "Please enter location: "))
# Restrict the data to North Carolina records, then display the result.
occurrencesTibble <- filter(occurrencesDataframe, state_ == "North Carolina")
tibble(occurrencesTibble)
## # A tibble: 37,797 × 10
## gbifid_ family_ scientificname_ count_ state_ county_ latitude_ longitude_
## <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl> <dbl>
## 1 658111854 Collet… Colletes valid… 1 North… Pender 34.7 -78.0
## 2 658113475 Collet… Colletes valid… 1 North… Pender 34.7 -78.0
## 3 658113474 Collet… Colletes valid… 1 North… Pender 34.7 -78.0
## 4 3756309144 Halict… Lasioglossum i… 1 North… Wake 35.8 -78.6
## 5 3756301154 Megach… Osmia lignaria… 1 North… Wake 35.8 -78.6
## 6 657679441 Andren… Andrena illini… 1 North… Swain 35.4 -83.4
## 7 3756336765 Andren… Andrena Fabric… 1 North… Wake 35.8 -78.6
## 8 657716497 Andren… Andrena barbar… 1 North… Swain 35.4 -83.4
## 9 3756306834 Andren… Andrena bisali… 1 North… Wake 35.8 -78.6
## 10 3756321788 Andren… Andrena Fabric… 1 North… Wake 35.8 -78.6
## # ℹ 37,787 more rows
## # ℹ 2 more variables: date_ <date>, locality_ <chr>
Get genus_ and species_ (genus and epithet) from scientificname_
# Parse every scientific name with the GBIF name parser (rgbif::name_parse).
names <- tibble(
  name_parse(
    scientificname = occurrencesTibble$scientificname_,
    curlopts = list()
  )
)
# Keep only the parsed pieces of interest, with placeholders where the parser
# gave no genus ("Gen.") or no specific epithet ("sp.").
names <- replace_na(
  tibble(
    epithet_ = names$specificepithet,
    genus_ = names$genusorabove,
    species_ = names$canonicalname
  ),
  list(genus_ = "Gen.", epithet_ = "sp.")
)
# Build the display label in one pass:
#   - epithet known   -> genus initial + ". " + epithet  (e.g. "B. rufocinctus")
#   - epithet missing -> genus + " sp."                  (e.g. "Bombus sp.")
# then place genus_ and species_ directly after family_.
occurrencesTibble <- occurrencesTibble %>%
  mutate(
    genus_ = names$genus_,
    species_ = if_else(
      names$epithet_ != "sp.",
      paste(str_sub(genus_, start = 1, end = 1), names$epithet_, sep = ". "),
      paste(genus_, names$epithet_, sep = " ")
    )
  ) %>%
  relocate(genus_, species_, .after = family_)
Citations
GBIF.org (16 January 2024) GBIF Occurrence Download https://doi.org/10.15468/dl.yg8u6y