This document describes how we map the validated red list data for Flanders to Darwin Core. The source file for this document can be found here.
Load libraries:
# devtools::install_github(c("tazinho/snakecase"))
library(tidyverse) # To do data science
library(magrittr) # To use %<>% pipes
library(here) # To find files
library(janitor) # To clean input data
library(digest) # To generate hashes
library(rgbif) # To use GBIF services
library(snakecase) # To convert case of descriptions
Create a data frame input_data
from the source data:
# Read the tab-delimited red list export from data/raw.
input_data <-
  here("data", "raw", "tblFlandersRedListsAll.tsv") %>%
  read_delim(delim = "\t")
Filter on Validated taxa:
# Keep only the records flagged as "Validated".
input_data <- input_data %>%
  filter(Validated == "Validated")
Number of records:
# Number of records left after filtering.
nrow(input_data)
## [1] 3063
Clean data somewhat:
# Drop rows that are entirely empty, then normalise the column names.
input_data <- remove_empty(input_data, "rows")
input_data <- clean_names(input_data)
The scientific names contain trailing spaces:
# Strip leading/trailing whitespace from both scientific name columns.
input_data <- input_data %>%
  mutate(
    speciesname_as_published = str_trim(speciesname_as_published),
    speciesname_unique = str_trim(speciesname_unique)
  )
Use the GBIF nameparser to retrieve nomenclatural information for the scientific names in the checklist:
# Send each distinct published name to the GBIF name parser (rgbif).
parsed_names <-
  input_data %>%
  distinct(speciesname_as_published) %>%
  pull(speciesname_as_published) %>% # Vector of unique names
  parsenames()
The nameparser function also provides information about the rank of the taxon (in `rankmarker`). Here we join this information with our checklist. Cleaning these ranks will be done in the Taxon Core mapping:
# Attach the parsed rank marker to every record, matching on the published name.
input_data <- input_data %>%
  left_join(
    parsed_names %>% select(scientificname, rankmarker),
    by = c("speciesname_as_published" = "scientificname")
  )
Since the source data only includes codes for references, we load an additional file with more complete reference information:
# Read the reference lookup table from data/raw.
references <-
  here("data", "raw", "references.csv") %>%
  read_csv()
Join source data with references:
# Add the reference details; both key columns carry the same name in each table.
input_data <- left_join(
  input_data,
  references,
  by = c("reference", "taxonomic_group")
)
Show the number of taxa per red list and taxonomic group:
Show the number of taxa per kingdom and rank:
Preview data:
# Show the first rows of the prepared input.
head(input_data)
# Start the Taxon Core from a copy of the prepared input.
taxon <- input_data
Map the data to Darwin Core Taxon.
# Record-level terms: constants for the whole dataset, plus the taxon identifier
# taken from unique_id.
taxon <- taxon %>%
  mutate(
    dwc_language = "en",
    dwc_license = "http://creativecommons.org/publicdomain/zero/1.0/",
    dwc_rightsHolder = "INBO",
    dwc_accessRights = "https://www.inbo.be/en/norms-data-use",
    dwc_datasetID = "https://doi.org/10.15468/8tk3tk",
    dwc_institutionCode = "INBO",
    dwc_datasetName = "Validated Red Lists of Flanders, Belgium",
    dwc_taxonID = unique_id
  )
Use the name as originally published on the checklist:
# Scientific name (as published) and higher classification, copied one-to-one
# from the source columns.
taxon <- taxon %>%
  mutate(
    dwc_scientificName = speciesname_as_published,
    dwc_kingdom = kingdom,
    dwc_phylum = phylum,
    dwc_class = class,
    dwc_order = order,
    dwc_family = family,
    dwc_genus = genus
  )
Inspect values:
# Tally the records per rank marker.
count(group_by(taxon, rankmarker))
Map values by recoding to the GBIF rank vocabulary:
# Translate rank markers to rank names; unlisted or missing markers become "".
taxon <- taxon %>%
  mutate(
    dwc_taxonRank = recode(
      rankmarker,
      "sp." = "species",
      "infrasp." = "infraspecificname",
      "subsp." = "subspecies",
      "var." = "variety",
      .default = "",
      .missing = ""
    )
  )
Inspect mapped values:
# Cross-tabulate the original rank markers against the mapped ranks.
count(group_by(taxon, rankmarker, dwc_taxonRank))
# Dutch name as vernacular name; nomenclatural code derived from the kingdom
# (no fallback branch, so other kingdoms get NA).
taxon <- taxon %>%
  mutate(
    dwc_vernacularName = speciesname_dutch,
    dwc_nomenclaturalCode = case_when(
      kingdom == "Animalia" ~ "ICZN",
      kingdom == "Plantae" ~ "ICBN"
    )
  )
Only keep the Darwin Core columns:
# Discard every column that was not created as a dwc_ term.
taxon <- select(taxon, starts_with("dwc_"))
Drop the dwc_
prefix:
# Strip the "dwc_" prefix from the column names. The pattern is anchored with
# "^" so only a leading "dwc_" is removed, never one occurring inside a name.
colnames(taxon) <- str_replace(colnames(taxon), "^dwc_", "")
Sort on taxonID
(to maintain some consistency between updates of the dataset):
# Order rows by taxonID.
taxon <- arrange(taxon, taxonID)
Preview data:
# Show the first rows of the Taxon Core.
head(taxon)
Save to CSV:
# Write the Taxon Core; missing values are written as empty strings.
write_csv(
  taxon,
  here("data", "processed", "validated", "taxon.csv"),
  na = ""
)
# Start the distribution extension from a copy of the prepared input.
distribution <- input_data
Map the data to Species Distribution.
# Taxon identifier plus the location terms, which are constant for all records.
distribution <- distribution %>%
  mutate(
    dwc_taxonID = unique_id,
    dwc_locationID = "ISO_3166:BE-VLG",
    dwc_locality = "Flanders",
    dwc_countryCode = "BE"
  )
Set to `absent` for regionally extinct species, otherwise `present`:
# Category "RE" maps to "absent"; every other category, including a missing
# one, maps to "present".
distribution <- distribution %>%
  mutate(
    dwc_occurrenceStatus = recode(
      rlc,
      "RE" = "absent",
      .default = "present",
      .missing = "present"
    )
  )
There are two red list category columns:
# Tally the combinations of the two red list category columns.
count(group_by(distribution, rlc, rlc_as_published))
This will be mapped as follows:
- `rlc` → `threatStatus`: IUCN equivalent of the Flemish status, according to the expected vocabulary.
- `rlc_as_published` → `occurrenceRemarks`: Flemish status as originally published in the red list. Not according to the vocabulary, but important to include.
distribution %<>% mutate(dwc_threatStatus = rlc)
# "Niet-inheemse broedvogel" is mapped to "introduced"; all other values,
# including missing ones, are mapped to "native". The event date is the
# publication year.
distribution <- distribution %>%
  mutate(
    dwc_establishmentMeans = if_else(
      rlc_as_published == "Niet-inheemse broedvogel",
      "introduced",
      "native",
      missing = "native"
    ),
    dwc_eventDate = year_published
  )
The source for the distribution information is the red list:
# Source is the red list itself; the originally published category is kept as
# a remark.
distribution <- distribution %>%
  mutate(
    dwc_source = source_red_list,
    dwc_occurrenceRemarks = rlc_as_published
  )
Only keep the Darwin Core columns:
# Discard every column that was not created as a dwc_ term.
distribution <- select(distribution, starts_with("dwc_"))
Drop the dwc_
prefix:
# Strip the "dwc_" prefix from the column names. The pattern is anchored with
# "^" so only a leading "dwc_" is removed, never one occurring inside a name.
colnames(distribution) <- str_replace(colnames(distribution), "^dwc_", "")
Sort on taxonID
:
# Order rows by taxonID.
distribution <- arrange(distribution, taxonID)
Preview data:
# Show the first rows of the distribution extension.
head(distribution)
Save to CSV:
# Write the distribution extension; missing values are written as empty strings.
write_csv(
  distribution,
  here("data", "processed", "validated", "distribution.csv"),
  na = ""
)
# Start the description extension from a copy of the prepared input.
description_ext <- input_data
Gather description columns to rows:
# Reshape the trait columns to long format: one row per record and trait, with
# the trait name in `type` and its value in `description`. Missing values are
# dropped.
description_ext <- description_ext %>%
  gather(
    key = "type",
    value = "description",
    biome, biotope1, biotope2, lifespan, cuddliness, mobility, spine,
    nutrient_level,
    na.rm = TRUE
  )
Rename biotope1
and biotope2
to biotope
:
# Collapse the two biotope columns into a single "biotope" type; other types
# are left unchanged.
description_ext <- description_ext %>%
  mutate(
    type = if_else(type %in% c("biotope1", "biotope2"), "biotope", type)
  )
Inspect values:
# Tally the combinations of type and description; the result holds only the
# grouping columns and the count.
count(group_by(description_ext, type, description))
Convert descriptions from CamelCase
to lower case
:
# Convert each description to sentence case (snakecase), then lower-case it.
description_ext <- description_ext %>%
  mutate(
    clean_description = description %>%
      to_sentence_case() %>%
      str_to_lower()
  )
Inspect mapped values:
# Compare the original descriptions with their cleaned versions.
count(group_by(description_ext, description, clean_description))
Map the data to Taxon Description.
# Taxon identifier, cleaned description, and description type. Only
# "nutrient_level" is rewritten (to "nutrient level"); other types pass through.
description_ext <- description_ext %>%
  mutate(
    dwc_taxonID = unique_id,
    dwc_description = clean_description,
    dwc_type = recode(type, "nutrient_level" = "nutrient level")
  )
The source for the life-history traits is not the red list, but a separate source:
# Source comes from the traits source column; language is fixed to "en".
description_ext <- description_ext %>%
  mutate(
    dwc_source = source_for_traits,
    dwc_language = "en"
  )
Only keep the Darwin Core columns:
# Discard every column that was not created as a dwc_ term.
description_ext <- select(description_ext, starts_with("dwc_"))
Drop the dwc_
prefix:
# Strip the "dwc_" prefix from the column names. The pattern is anchored with
# "^" so only a leading "dwc_" is removed, never one occurring inside a name.
colnames(description_ext) <- str_replace(colnames(description_ext), "^dwc_", "")
Sort on taxonID
:
# Order rows by taxonID.
description_ext <- arrange(description_ext, taxonID)
Preview data:
# Show the first rows of the description extension.
head(description_ext)
Save to CSV:
# Write the description extension; missing values are written as empty strings.
write_csv(
  description_ext,
  here("data", "processed", "validated", "description.csv"),
  na = ""
)