For the capstone project of my data analytics certificate, I decided to analyze trends in bee populations. Unfortunately, I’m not a biologist, and I don’t know how to extrapolate species abundance from occurrence data, and there are surely a bunch of other factors that I’m overlooking, so I cut my part of the project short. Regardless, I’m putting it here in case actual scientists find it; maybe this work will not be useless lol.

I downloaded the data from the Global Biodiversity Information Facility (GBIF). A citation with the DOI can be found at the bottom of this page.

Load libraries

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(rgbif)
library(DBI)
library(RPostgres)
library(maps)

## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map

Dataframe

# Read the GBIF occurrence export from disk.
occurrencesDataframe <- read_csv(
  "~/Documents/projects/pages/files/occurrencesDataframe.csv"
)

# Make the column types explicit: numeric GBIF ids and Date-class dates.
occurrencesDataframe <- mutate(
  occurrencesDataframe,
  gbifid_ = as.numeric(gbifid_),
  date_ = as_date(date_)
)

## Rows: 2287581 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): family_, scientificname_, state_, county_
## dbl  (4): gbifid_, count_, latitude_, longitude_
## date (1): date_
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(occurrencesDataframe)

## tibble [2,287,581 × 9] (S3: tbl_df/tbl/data.frame)
##  $ gbifid_        : num [1:2287581] 6.58e+08 1.46e+09 3.10e+09 3.10e+09 1.46e+09 ...
##  $ family_        : chr [1:2287581] "Apidae" "Halictidae" "Megachilidae" "Andrenidae" ...
##  $ scientificname_: chr [1:2287581] "Bombus rufocinctus Cresson, 1863" "Halictidae" "Megachilidae" "Andrenidae" ...
##  $ count_         : num [1:2287581] 1 1 1 1 1 1 1 1 1 1 ...
##  $ state_         : chr [1:2287581] "New Mexico" "Utah" "Utah" "Utah" ...
##  $ county_        : chr [1:2287581] "Lincoln" "Salt Lake" "Salt Lake" "Salt Lake" ...
##  $ latitude_      : num [1:2287581] 33.6 40.8 40.7 40.7 40.8 ...
##  $ longitude_     : num [1:2287581] -106 -112 -112 -112 -112 ...
##  $ date_          : Date[1:2287581], format: "1923-01-01" "1923-01-01" ...

Replace missing (NA) values in count_ with 1 and floor date_ to the first day of its month

# Normalise both columns in one pass:
#   * count_: a missing count becomes 1
#   * date_ : rounded down to the first day of its month
occurrencesDataframe <- mutate(
  occurrencesDataframe,
  count_ = replace_na(count_, 1),
  date_ = floor_date(date_, unit = "month")
)

Look up each occurrence's locality (locality_) from its coordinates (latitude_ and longitude_), and title-case state_ and county_

# Drop occurrences that are missing EITHER coordinate.
# Testing `is.na(latitude_ | longitude_)` is not equivalent: `|` coerces the
# numbers to logicals and `NA | TRUE` is TRUE, so a row with one missing
# coordinate and a non-zero other coordinate would be kept.
occurrencesDataframe <- occurrencesDataframe %>%
  drop_na(latitude_, longitude_)

# Look up each point in the maps "county" database and store the name of the
# matching region in locality_.
occurrencesDataframe$locality_ <- map.where(
  database = "county",
  x = occurrencesDataframe$longitude_,
  y = occurrencesDataframe$latitude_
)
# Disabled: splitting locality_ into state_ and county_. While this stays
# commented out, state_ and county_ keep the values read from the CSV.
# occurrencesDataframe <- occurrencesDataframe %>%
#   relocate(locality_, .after = count_) %>%
#   separate_wider_delim(locality_, delim = ",", names = c("state_", "county_"))

# Title-case state_ and county_ so they compare cleanly against names such as
# "North Carolina".
occurrencesDataframe <- occurrencesDataframe %>%
  mutate(across(c(state_, county_), str_to_title))

Establish occurrences location

# Interactive alternative (disabled): prompt for the location instead of
# hard-coding it.
#locations <- str_to_title(readline(prompt = "Please enter location: "))

# Restrict the analysis to occurrences recorded in North Carolina.
occurrencesTibble <- filter(occurrencesDataframe, state_ == "North Carolina")

# Show a preview of the filtered occurrences.
occurrencesTibble

## # A tibble: 37,797 × 10
##       gbifid_ family_ scientificname_ count_ state_ county_ latitude_ longitude_
##         <dbl> <chr>   <chr>            <dbl> <chr>  <chr>       <dbl>      <dbl>
##  1  658111854 Collet… Colletes valid…      1 North… Pender       34.7      -78.0
##  2  658113475 Collet… Colletes valid…      1 North… Pender       34.7      -78.0
##  3  658113474 Collet… Colletes valid…      1 North… Pender       34.7      -78.0
##  4 3756309144 Halict… Lasioglossum i…      1 North… Wake         35.8      -78.6
##  5 3756301154 Megach… Osmia lignaria…      1 North… Wake         35.8      -78.6
##  6  657679441 Andren… Andrena illini…      1 North… Swain        35.4      -83.4
##  7 3756336765 Andren… Andrena Fabric…      1 North… Wake         35.8      -78.6
##  8  657716497 Andren… Andrena barbar…      1 North… Swain        35.4      -83.4
##  9 3756306834 Andren… Andrena bisali…      1 North… Wake         35.8      -78.6
## 10 3756321788 Andren… Andrena Fabric…      1 North… Wake         35.8      -78.6
## # ℹ 37,787 more rows
## # ℹ 2 more variables: date_ <date>, locality_ <chr>

Get genus_ and species_ (genus and epithet) from scientificname_

# Split every scientific name into its components with rgbif's name_parse().
names <- occurrencesTibble$scientificname_ %>%
  name_parse(curlopts = list()) %>%
  tibble()

# Keep the parsed pieces under this file's trailing-underscore names, and fill
# anything the parser left missing with placeholders: "Gen." for the genus and
# "sp." for the epithet.
names <-
  tibble(
    epithet_ = names$specificepithet,
    genus_ = names$genusorabove,
    species_ = names$canonicalname
  ) %>%
  replace_na(list(genus_ = "Gen.", epithet_ = "sp."))

# Attach the genus and build a short species label:
#   * epithet known   -> "<genus initial>. <epithet>"
#   * epithet unknown -> "<genus> sp."
# Then place genus_ and species_ directly after family_.
occurrencesTibble <- occurrencesTibble %>%
  mutate(
    genus_ = names$genus_,
    species_ = if_else(
      names$epithet_ == "sp.",
      paste(names$genus_, names$epithet_, sep = " "),
      paste(
        str_sub(names$genus_, start = 1, end = 1),
        names$epithet_,
        sep = ". "
      )
    )
  ) %>%
  relocate(genus_, species_, .after = family_)

Citations

GBIF.org (16 January 2024) GBIF Occurrence Download https://doi.org/10.15468/dl.yg8u6y

Conserve the Bees

Citations