用 R 下载 GBIF 数据

R 程辑包 rgbif 可以非常方便地从 GBIF (Global Biodiversity Information Facility) 数据库中按物种、属、科等分类信息导出所需的数据。参见 rgbif tutorial。

library(rgbif)
library(dplyr)
library(sf)
library(spData)
library(tmap)
library(purrr)
library(readr)
library(magrittr) # for %T>% pipe
library(rgbif) # for occ_download
library(taxize) # for get_gbifid

搜索某一物种的数据

根据物种学名搜索数据：

# Search GBIF occurrence records by scientific name.
sq <- occ_search(scientificName = "Sinotaia quadrata")
names(sq)
# [1] "meta"      "hierarchy" "data"      "media"     "facets"

# For the geographic distribution, keep the record key, the name and the
# coordinates. Columns are selected by name rather than by position so the
# result does not depend on the column order returned by GBIF.
# Only records with BOTH coordinates are kept: comparing against the string
# "NA" combined with `|` would let rows with a single missing coordinate through.
sq_location <- sq$data %>%
  select(key, scientificName, decimalLatitude, decimalLongitude) %>%
  filter(!is.na(decimalLongitude), !is.na(decimalLatitude)) %>%
  data.frame()
#          key                   scientificName decimalLatitude decimalLongitude
# 1 2242953256 Sinotaia quadrata (Benson, 1842)        36.38850        126.56625
# 2 2557790953 Sinotaia quadrata (Benson, 1842)       -31.32859        -64.46254
# 3 2005260509 Sinotaia quadrata (Benson, 1842)        25.10170        121.51041
# 4 1880618814 Sinotaia quadrata (Benson, 1842)        24.72333        121.72417
# 5 2005272605 Sinotaia quadrata (Benson, 1842)        24.65829        121.82159
# 6 2306370637 Sinotaia quadrata (Benson, 1842)        43.81000         11.03000

可以直接在 occ_search() 函数中指定 fields 参数，从而只输出所需的列：

# Same search, but ask GBIF to return only the four columns of interest.
sq2 <- occ_search(
  scientificName = "Sinotaia quadrata",
  fields = c("key", "scientificName", "decimalLongitude", "decimalLatitude")
)
head(sq2$data)
# A tibble: 6 x 4
#   key        scientificName                   decimalLongitude decimalLatitude
#   <chr>      <chr>                                       <dbl>           <dbl>
# 1 2242953256 Sinotaia quadrata (Benson, 1842)            127.             36.4
# 2 2557790953 Sinotaia quadrata (Benson, 1842)            -64.5           -31.3
# 3 2005260509 Sinotaia quadrata (Benson, 1842)            122.             25.1
# 4 1880618814 Sinotaia quadrata (Benson, 1842)            122.             24.7
# 5 2005272605 Sinotaia quadrata (Benson, 1842)            122.             24.7
# 6 2244501338 Sinotaia quadrata (Benson, 1842)             NA              NA

搜索多个物种的数据

当有很多物种需要搜索时，无需逐个搜索，只需准备一个物种列表即可同时搜索：

# Species to look up.
splist <- c("Sinotaia quadrata", "Sinotaia aeruginosa", "Sinotaia purificata")

# Resolve each name to the key of its first GBIF name suggestion.
# name_suggest() returns a tibble in older rgbif releases and a list with a
# `data` tibble in newer ones, so both shapes are accepted here.
# vapply() guarantees an integer vector; names without any suggestion give NA.
keys <- vapply(
  splist,
  function(sp_name) {
    suggestions <- name_suggest(q = sp_name)
    if (!is.data.frame(suggestions)) {
      suggestions <- suggestions$data
    }
    if (is.null(suggestions) || nrow(suggestions) == 0L) {
      return(NA_integer_)
    }
    as.integer(suggestions$key[1])
  },
  integer(1),
  USE.NAMES = FALSE
)

# Search occurrences for every resolved key (unmatched names are skipped).
sps <- occ_search(taxonKey = keys[!is.na(keys)])
# Note: records found this way sometimes lack the desired coordinates.

此外，还可以使用你的 GBIF 账户和密码，在 R 中直接将多个物种的数据下载下来。

# GBIF account credentials passed to occ_download() below.
# They are read from the GBIF_USER / GBIF_PWD / GBIF_EMAIL environment
# variables (e.g. set in ~/.Renviron) so the real password never has to be
# written into the script. The placeholders are used only when a variable is
# unset and must then be replaced by your own values.
user <- Sys.getenv("GBIF_USER", unset = "user_name") # GBIF account user name
pwd <- Sys.getenv("GBIF_PWD", unset = "password")    # GBIF account password
email <- Sys.getenv("GBIF_EMAIL", unset = "Email")   # e-mail linked to the account

# Species names to search for, taken from a hand-made list.
# The file is read as semicolon-separated text with a header row.

file_url <- read.csv(file = "species_list.csv", sep = ";", header = TRUE)

# Get taxon keys from GBIF based on the species names.
# The file name is passed to write_tsv() positionally: the `path` argument is
# deprecated in newer readr releases (renamed to `file`), while the positional
# form works with both old and new versions.
# NOTE(review): the kingdom filter keeps plant names only; adjust it if the
# species list covers another group.

gbif_taxon_keys <- file_url[, 1] %>% # the first column holds the species names
  taxize::get_gbifid_(method = "backbone") %>% # match names to the GBIF backbone
  imap(~ .x %>% mutate(original_sciname = .y)) %>% # add the original name back
  bind_rows() %T>% # combine all data.frames into one
  readr::write_tsv("all_matches.tsv") %>% # side effect: save matches for inspection
  filter(matchtype == "EXACT" & status == "ACCEPTED") %>% # accepted, exact matches only
  filter(kingdom == "Plantae") %>% # drop anything matched to a non-plant
  pull(usagekey) # the GBIF taxon keys

# Request a GBIF download of all georeferenced records for the taxon keys.
# The commented-out predicates are optional extra filters.

occ_download(
  pred_in("taxonKey", gbif_taxon_keys),
  # pred_in("basisOfRecord", c('PRESERVED_SPECIMEN','HUMAN_OBSERVATION','OBSERVATION','MACHINE_OBSERVATION')),
  # pred_gt("elevation", 5000),
  # pred("country", "US"),
  pred("hasCoordinate", TRUE),
  # pred("hasGeospatialIssue", FALSE),
  # pred_gte("year", 1990),
  format = "SIMPLE_CSV",
  user = user,
  pwd = pwd,
  email = email
)

这时，登录你的 GBIF 账户，会发现在 Download 目录下已有当前正在下载的任务，任务完成后点击 download，即可下载 .csv 文件。

上述方法还可以进一步简化：直接根据物种（或属、科）名称查询其在 GBIF 中的 taxon key，再根据 key 下载数据。

# List species/genus/family names (one name per row of a one-column data frame)

list <- c("Sinotaia", "Heterogen", "Mekongia", "Filopaludina",
    "Margarya", "Neothauma", "Viviparus", "Taia", "Torotaia", "Idiopoma",
    "Bellamya", "Campeloma", "Angulyagra", "Trochopaludina",
    "Cipangopaludina", "Rivularia", "Anularya", "Lioplacodes") %>% data.frame()

# Get GBIF keys
# For every name, collect the keys of all genus-rank suggestions. The pieces
# are built with lapply() and bound once at the end instead of growing a data
# frame inside a loop; seq_len() keeps an empty list from iterating over 1:0.

n <- nrow(list)
key_tables <- lapply(seq_len(n), function(i) {
  name <- name_suggest(q = list[i, ], rank = "genus")
  # Older rgbif returns a tibble, newer releases a list with a `data` tibble.
  if (!is.data.frame(name)) {
    name <- name$data
  }
  # Skip names without any suggestion; data.frame() would otherwise fail on
  # columns of differing length.
  if (is.null(name) || nrow(name) == 0L) {
    return(NULL)
  }
  # Column 1: GBIF key, column 2: the queried name (recycled per key).
  data.frame(name$key, list[i, ])
})
list.keys <- do.call(rbind, key_tables)
list.keys

# Request a GBIF download of all georeferenced records for the keys in the
# first column of list.keys. The commented-out predicates are optional filters.

occ_download(
  pred_in("taxonKey", list.keys[, 1]),
  # pred_in("basisOfRecord", c('PRESERVED_SPECIMEN','HUMAN_OBSERVATION','OBSERVATION','MACHINE_OBSERVATION')),
  # pred_gt("elevation", 5000),
  # pred("country", "US"),
  pred("hasCoordinate", TRUE),
  # pred("hasGeospatialIssue", FALSE),
  # pred_gte("year", 1990),
  format = "SIMPLE_CSV",
  user = user,
  pwd = pwd,
  email = email
)

数据使用

最常见的数据使用就是自定义绘制物种分布地图。根据每个个体的经纬度数据，制成 GIS 点向量，即可在各种矢量和栅格地图中绘制分布点。

# Make geometry vector of points
# Build all points in one vectorised st_as_sf() call instead of creating and
# rbind()-ing one sf object per row. Records with a missing coordinate are
# dropped first because a point cannot be built from NA.

sq_points <- sq_location %>%
  filter(!is.na(decimalLongitude), !is.na(decimalLatitude)) %>%
  st_as_sf(
    coords = c("decimalLongitude", "decimalLatitude"), # x = longitude, y = latitude
    crs = "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
  )
# Keep only the geometry column, matching the shape the map code expects.
sq_location_df <- st_sf(geometry = st_geometry(sq_points))
sq_location_df


# Plot the map: white world polygons with the occurrence points in red on top.

base_layer <- tm_shape(world) +
  tm_polygons(col = "white")
point_layer <- tm_shape(sq_location_df) +
  tm_symbols(size = 0.05, col = "red", border.col = "red")
base_layer + point_layer