7 Natural Selection
Session Presenters
Required packages
Make sure you have the packages installed, see Install dartRverse.
#devtools::install_github("pygmyperch/melfuR")
#BiocManager::install('qvalue')
library(adegenet)
library(LEA)
library(vegan)
library(fmsb)
library(psych)
library(dartRverse)
library(melfuR)
library(sdmpredictors)
library(sf)
library(raster)
library(robust)
library(qvalue)
library(mlbench)
library(caret)
library(RAINBOWR)
library(qqman)
library(randomForest)
library(ggplot2) # used below to plot candidate q-values
GEA analysis of SNP and environmental data using RDA
First let's load in some functions we will be using.
source("utils.R")
1. Data preparation
# load genlight object
load("data/Mf5000_gl.RData")
Mf5000_gl
/// GENLIGHT OBJECT /////////
// 249 genotypes, 5,000 binary SNPs, size: 1.1 Mb
12917 (1.04 %) missing data
// Basic content
@gen: list of 249 SNPbin
@ploidy: ploidy of each individual (range: 2-2)
// Optional content
@ind.names: 249 individual labels
@loc.names: 5000 locus labels
@pop: population of each individual (group size range: 9-30)
@other: a list containing: X
Mf5000_gl@pop
[1] MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR
[19] MBR MBR MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC
[37] MDC MDC MDC MDC MDC MDC GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL
[55] GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL
[73] WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK BEN BEN
[91] BEN BEN BEN BEN BEN BEN BEN BEN BEN BEN BEN BEN BOG BOG BOG BOG BOG BOG
[109] BOG BOG BOG BOG BOG BOG BOG BOG BOG BOG PEL PEL PEL PEL PEL PEL PEL PEL
[127] PEL GWY GWY GWY GWY GWY GWY GWY GWY GWY GWY GWY GWY DUM DUM DUM DUM DUM
[145] DUM DUM DUM DUM DUM DUM DUM DUM DUM MIB MIB MIB MIB MIB MIB MIB MIB MIB
[163] MIB MIB MIB MIB MIB MIB MIB MIB MIB MIB MIB STG STG STG STG STG STG STG
[181] STG STG STG STG STG STG STG STG STG STG STG STG STG OAK OAK OAK OAK OAK
[199] OAK OAK OAK OAK OAK OAK OAK OAK OAK OAK OAK OAK OAK WAR WAR WAR WAR WAR
[217] WAR WAR WAR WAR WAR WAR WAR WAR WAR WAR WAR WAR WAR WAR WAR KIL KIL KIL
[235] KIL KIL KIL KIL KIL KIL KIL KIL KIL KIL KIL KIL KIL KIL KIL
Levels: BEN BOG DUM GGL GWY KIL MBR MDC MIB OAK PEL STG WAK WAR
## What do you notice about the factor levels?
# Factors in R can make you question your life choices like nothing else...
# re-order the pop levels to match the order of individuals in your data
Mf5000_gl@pop <- factor(Mf5000_gl@pop, levels = as.character(unique(Mf5000_gl@pop)))
Mf5000_gl@pop
[1] MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR MBR
[19] MBR MBR MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC MDC
[37] MDC MDC MDC MDC MDC MDC GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL
[55] GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL GGL
[73] WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK WAK BEN BEN
[91] BEN BEN BEN BEN BEN BEN BEN BEN BEN BEN BEN BEN BOG BOG BOG BOG BOG BOG
[109] BOG BOG BOG BOG BOG BOG BOG BOG BOG BOG PEL PEL PEL PEL PEL PEL PEL PEL
[127] PEL GWY GWY GWY GWY GWY GWY GWY GWY GWY GWY GWY GWY DUM DUM DUM DUM DUM
[145] DUM DUM DUM DUM DUM DUM DUM DUM DUM MIB MIB MIB MIB MIB MIB MIB MIB MIB
[163] MIB MIB MIB MIB MIB MIB MIB MIB MIB MIB MIB STG STG STG STG STG STG STG
[181] STG STG STG STG STG STG STG STG STG STG STG STG STG OAK OAK OAK OAK OAK
[199] OAK OAK OAK OAK OAK OAK OAK OAK OAK OAK OAK OAK OAK WAR WAR WAR WAR WAR
[217] WAR WAR WAR WAR WAR WAR WAR WAR WAR WAR WAR WAR WAR WAR WAR KIL KIL KIL
[235] KIL KIL KIL KIL KIL KIL KIL KIL KIL KIL KIL KIL KIL KIL KIL
Levels: MBR MDC GGL WAK BEN BOG PEL GWY DUM MIB STG OAK WAR KIL
# convert to genind
Mf5000.genind <- gl2gi(Mf5000_gl)
Starting gl2gi
Processing genlight object with SNP data
Matrix converted.. Prepare genind object...
Completed: gl2gi
Mf5000.genind
/// GENIND OBJECT /////////
// 249 individuals; 5,000 loci; 10,000 alleles; size: 12.2 Mb
// Basic content
@tab: 249 x 10000 matrix of allele counts
@loc.n.all: number of alleles per locus (range: 2-2)
@loc.fac: locus factor for the 10000 columns of @tab
@all.names: list of allele names for each locus
@ploidy: ploidy of each individual (range: 2-2)
@type: codom
@call: df2genind(X = xx[, ], sep = "/", ncode = 1, ind.names = x@ind.names,
pop = x@pop, NA.char = "-", ploidy = 2)
// Optional content
@pop: population of each individual (group size range: 9-30)
@other: a list containing: X
# Ok we have our genotypes
Ordination analyses cannot handle missing data…
How can we handle the missing data?
remove loci with missing data?
remove individuals with missing data?
impute missing data, ok how?
most common genotype across all data?
most common genotype per site/population/other?
based on population allele frequencies characterised using snmf (admixture)?
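For a feel for the simpler options, here is a minimal sketch of per-site/population most-common-genotype imputation; the impute_by_pop helper is illustrative only, not part of melfuR (below we use snmf-based imputation via melfuR::impute.data instead).
# illustrative helper: impute each NA with the most common genotype in its population
impute_by_pop <- function(snps, pops) {
  for (p in unique(pops)) {
    rows <- which(pops == p)
    snps[rows, ] <- apply(snps[rows, , drop = FALSE], 2, function(x) {
      if (all(is.na(x))) return(x) # nothing to impute from in this pop
      replace(x, is.na(x), as.numeric(names(which.max(table(x)))))
    })
  }
  snps
}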
# ADMIXTURE results most likely 6 pops
imputed.gi <- melfuR::impute.data(Mf5000.genind, K = 6)
Total missing data = 1.04 %
The project is saved into :
dat.snmfProject
To load the project, use:
project = load.snmfProject("dat.snmfProject")
To remove the project, use:
remove.snmfProject("dat.snmfProject")
[1] 42
[1] "*************************************"
[1] "* create.dataset *"
[1] "*************************************"
summary of the options:
[...]
Missing genotype imputation for K = 6
Missing genotype imputation for run = 3
Results are written in the file: ./dat.lfmm_imputed.lfmm
Total missing data was = 1.04 %
Total missing data now = 0 %
# re-order each genotype as major/minor allele
# (generally no real reason to do this... but it's neat and might be useful for something)
imputed.sorted.gi <- sort_alleles(imputed.gi)
# check results
imputed.gi@tab[1:10,1:6]
SNP_1.C SNP_1.G SNP_2.A SNP_2.T SNP_3.T SNP_3.A
MBR_10 1 1 1 1 2 0
MBR_11 1 1 1 1 2 0
MBR_12 1 1 2 0 2 0
MBR_14 0 2 2 0 2 0
MBR_15 0 2 1 1 2 0
MBR_16 0 2 2 0 2 0
MBR_17 0 2 0 2 2 0
MBR_18 0 2 1 1 2 0
MBR_19 1 1 2 0 2 0
MBR_1 0 2 1 1 2 0
imputed.sorted.gi@tab[1:10,1:6]
SNP_1.G SNP_1.C SNP_2.A SNP_2.T SNP_3.T SNP_3.A
MBR_10 1 1 1 1 2 0
MBR_11 1 1 1 1 2 0
MBR_12 1 1 2 0 2 0
MBR_14 2 0 2 0 2 0
MBR_15 2 0 1 1 2 0
MBR_16 2 0 2 0 2 0
MBR_17 2 0 0 2 2 0
MBR_18 2 0 1 1 2 0
MBR_19 1 1 2 0 2 0
MBR_1 2 0 1 1 2 0
# The allele counts for SNP_1 were ordered C/G before sorting; now they are G/C
unlink(c('dat.geno', 'dat.lfmm', 'dat.snmfProject', 'dat.snmf', 'dat.lfmm_imputed.lfmm',
'individual_env_data.csv'), recursive = T)
The sort_alleles function also has an optional ‘pops’ argument where you can specify a population to use as the reference if you want to analyse relative to some focal pop.
Format SNP data for ordination analysis in vegan
So we have our genotypes and have filled in the missing data. We now need to format the data for use in the RDA. We can do the analysis at an individual level using a matrix of allele counts per locus, or at a population level using population allele frequencies. Both of those options are easy to format from a genind object.
Individual-based
# for individual based analyses
# get allele counts
alleles <- imputed.sorted.gi@tab

# get genotypes (counts of reference allele) and clean up locus names
snps <- alleles[,seq(1,ncol(alleles),2)]
colnames(snps) <- locNames(imputed.sorted.gi)
snps[1:10,1:10]
SNP_1 SNP_2 SNP_3 SNP_4 SNP_5 SNP_6 SNP_7 SNP_8 SNP_9 SNP_10
MBR_10 1 1 2 2 2 2 1 1 2 1
MBR_11 1 1 2 2 2 2 2 2 2 0
MBR_12 1 2 2 2 2 1 2 2 2 1
MBR_14 2 2 2 2 2 2 1 1 2 2
MBR_15 2 1 2 2 2 2 1 1 2 2
MBR_16 2 2 2 2 2 2 2 2 2 2
MBR_17 2 0 2 2 2 2 0 2 2 2
MBR_18 2 1 2 2 2 1 1 2 2 0
MBR_19 1 2 2 2 2 1 2 2 2 2
MBR_1 2 1 2 2 1 2 2 2 2 2
# Alternative: impute missing data with most common genotype across all data
# alleles <- Mf5000.genind@tab
# snps <- alleles[,seq(1,ncol(alleles),2)]
# colnames(snps) <- locNames(Mf5000.genind)
# snps <- apply(snps, 2, function(x) replace(x, is.na(x), as.numeric(names(which.max(table(x))))))
# check total % missing data
# (sum(is.na(snps)))/(dim(snps)[1]*dim(snps)[2])*100
Population-based
# for population based analyses
# get pop allele frequencies
gp <- genind2genpop(imputed.sorted.gi)
Converting data from a genind to a genpop object...
...done.
AF <- makefreq(gp)
Finding allelic frequencies from a genpop object...
...done.
# drop one (redundant) allele per locus
AF <- AF[,seq(1,ncol(AF),2)]
colnames(AF) <- locNames(gp)
rownames(AF) <- levels(imputed.sorted.gi@pop)
AF[1:14,1:6]
SNP_1 SNP_2 SNP_3 SNP_4 SNP_5 SNP_6
MBR 0.87500000 0.8250000 0.9750000 1.0000000 0.9750000 0.8250000
MDC 0.70454545 0.7045455 0.8409091 0.9545455 0.8863636 0.8181818
GGL 0.71666667 0.7833333 0.9000000 0.9166667 0.8500000 0.8000000
WAK 0.93750000 1.0000000 1.0000000 1.0000000 1.0000000 0.3125000
BEN 0.92857143 1.0000000 1.0000000 1.0000000 1.0000000 0.6071429
BOG 0.75000000 0.8125000 0.7812500 1.0000000 0.7812500 0.9062500
PEL 0.83333333 0.7222222 0.9444444 0.9444444 0.8888889 1.0000000
GWY 0.75000000 0.9166667 1.0000000 1.0000000 1.0000000 0.8333333
DUM 0.82142857 1.0000000 0.9285714 0.9285714 1.0000000 0.8571429
MIB 0.92500000 0.7750000 0.9250000 0.8250000 0.9750000 0.8500000
STG 0.97500000 0.8750000 0.6500000 0.8000000 1.0000000 0.8000000
OAK 0.02777778 0.6388889 0.6388889 0.6666667 0.3333333 1.0000000
WAR 0.20000000 0.9000000 0.7000000 0.5750000 0.2750000 1.0000000
KIL 0.25000000 0.7777778 0.5277778 0.7222222 0.3055556 1.0000000
Get environmental data
Ok, our SNP data are formatted and ready to go. Now we need some environmental data…
What are the variables you want to use? Considerations: prior/expert knowledge, existing hypotheses, species distribution models (SDMs), the most important variables. Data availability? Think about surrogates/proxies if the specific data are not available; raw variables, PCA, or other transformations?
In this case we are going to keep it simple and just use a few WorldClim variables: Bio01 (annual mean temperature), Bio05 (temperature in the hottest month), Bio15 (rainfall seasonality) and Bio19 (rainfall in the coldest quarter).
# set the data directory and extend the response timeout (sometimes the database can be slow to respond)
options(sdmpredictors_datadir="data/spatial_data")
options(timeout = max(300, getOption("timeout")))
# Explore environmental layers
::list_layers("WorldClim") sdmpredictors
dataset_code layer_code name
1 WorldClim WC_alt Altitude
2 WorldClim WC_bio1 Annual mean temperature
3 WorldClim WC_bio2 Mean diurnal temperature range
4 WorldClim WC_bio3 Isothermality
5 WorldClim WC_bio4 Temperature seasonality
6 WorldClim WC_bio5 Maximum temperature
7 WorldClim WC_bio6 Minimum temperature
8 WorldClim WC_bio7 Annual temperature range
9 WorldClim WC_bio8 Mean temperature of wettest quarter
10 WorldClim WC_bio9 Mean temperature of driest quarter
11 WorldClim WC_bio10 Mean temperature of warmest quarter
12 WorldClim WC_bio11 Mean temperature of coldest quarter
13 WorldClim WC_bio12 Annual precipitation
14 WorldClim WC_bio13 Precipitation of wettest month
15 WorldClim WC_bio14 Precipitation of driest month
16 WorldClim WC_bio15 Precipitation seasonality
17 WorldClim WC_bio16 Precipitation of wettest quarter
18 WorldClim WC_bio17 Precipitation of driest quarter
19 WorldClim WC_bio18 Precipitation of warmest quarter
[...]
# Explore environmental layers
layers <- as.data.frame(sdmpredictors::list_layers("WorldClim"))
write.csv(layers, "./data/WorldClim.csv")
# Download specific layers to the datadir
# Bio01 (annual mean temperature), Bio05 (temperature in hottest month), Bio15 (rainfall seasonality) and Bio19 (rainfall in coldest quarter)
WC <- load_layers(c("WC_bio1", "WC_bio5", "WC_bio15", "WC_bio19"))
plot(WC)
# Crop rasters to study area and align CRS
# get Murray-Darling Basin polygon
<- st_read("data/spatial_data/MDB_polygon/MDB.shp") mdb
Reading layer `MDB' from data source
`/home/gruber/kioloa/data/spatial_data/MDB_polygon/MDB.shp'
using driver `ESRI Shapefile'
Simple feature collection with 1 feature and 18 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 138.57 ymin: -37.68 xmax: 152.49 ymax: -24.585
Geodetic CRS: GDA94
# set CRS
mdb <- st_transform(mdb, 4326)

# convert to sp Spatial format
mdb <- as(mdb, "Spatial")

# crop WorldClim rasters to MDB extent, then mask pixels outside of the MDB
ENV <- crop(WC, extent(mdb))
ENV <- mask(ENV, mdb)
# get sampling sites, format as sf
sites <- read.csv("data/Mf_xy.csv", header = TRUE)
sites_sf <- st_as_sf(sites, coords = c("X", "Y"), crs = 4326)

# generate a nice color ramp for plotting the rasters
my.colors <- colorRampPalette(c("#5E85B8","#EDF0C0","#C13127"))
# include other spatial data that you might want to show
<- st_read("data/spatial_data/Mf_streams/Mf_streams.shp") rivers
Reading layer `Mf_streams' from data source
`/home/gruber/kioloa/data/spatial_data/Mf_streams/Mf_streams.shp'
using driver `ESRI Shapefile'
Simple feature collection with 2826 features and 31 fields
Geometry type: LINESTRING
Dimension: XY
Bounding box: xmin: 138.7888 ymin: -36.52625 xmax: 152.2712 ymax: -26.75375
Geodetic CRS: GDA94
rivers <- st_transform(rivers, 4326)
rivers <- as(rivers, "Spatial")
# plot a layer
plot(ENV$WC_bio1, col=my.colors(100), main=names(ENV$WC_bio1))
lines(rivers, col="blue", lwd=0.3)
text(st_coordinates(sites_sf)[,1], st_coordinates(sites_sf)[,2], labels=sites_sf$site, cex=0.5)
Save PDF
# format nicely and export as pdf
pdf("./images/env_rasters.pdf")
for(i in 1:nlayers(ENV)) {
  rasterLayer <- ENV[[i]]

  # skew the colour ramp to improve visualisation (not necessary, but nice for data like this)
  p95 <- quantile(values(rasterLayer), probs = 0.05, na.rm = TRUE)
  breaks <- c(minValue(rasterLayer), seq(p95, maxValue(rasterLayer), length.out = 50))
  breaks <- unique(c(seq(minValue(rasterLayer), p95, length.out = 10), breaks))
  color.palette <- my.colors(length(breaks) - 1)
  layerColors <- if(i < 4) color.palette else rev(color.palette)

  # tidy up the values displayed in the legend
  simplifiedBreaks <- c(min(breaks), quantile(breaks, probs = c(0.25, 0.5, 0.75)), max(breaks))

  # plot the rasters
  plot(rasterLayer, breaks=breaks, col=layerColors, main=names(rasterLayer),
       axis.args=list(at=simplifiedBreaks, labels=as.character(round(simplifiedBreaks))),
       legend.args=list(text='', side=3, line=2, at=simplifiedBreaks))
  lines(rivers, col="blue", lwd=0.3)
  text(st_coordinates(sites_sf)[,1], st_coordinates(sites_sf)[,2], labels=sites_sf$site, cex=0.5)
}
dev.off()
Extract site environmental data
# and finally extract the environmental data for your sample sites
env.site <- data.frame(site = sites_sf$site,
                       X = st_coordinates(sites_sf)[,1],
                       Y = st_coordinates(sites_sf)[,2],
                       omegaX = sites_sf$omegaX,
                       omegaY = sites_sf$omegaY)
env.dat <- as.data.frame(raster::extract(ENV, st_coordinates(sites_sf)))
env.site <- cbind(env.site, env.dat)
Have a look at the env.site data frame. This is where we will keep all of the metadata for use in the analyses. Keep an eye out: we are going to add a few more columns to it and eventually munge it into individual-level metadata.
# let's also just do a sanity check to make sure the genomic and environmental data are in the same order
stopifnot(all(env.site$site == rownames(AF)))
PCA
# to get a feel for the data we can run a quick PCA using the rda function in vegan
pc <- rda(AF)
plot(pc)
Ok, not really that impressive… let’s set some plotting variables to make it easier to interpret
# set colours, markers and labels for plotting and add to env.site df
env.site$cols <- c('darkturquoise', 'darkturquoise', 'darkturquoise', 'limegreen', 'limegreen', 'darkorange1', 'slategrey', 'slategrey', 'slategrey', 'slategrey', 'slategrey', 'dodgerblue3', 'gold', 'gold')
env.site$pch <- c(15, 16, 17, 17, 15, 15, 16, 15, 18, 16, 16, 8, 16, 16)

# generate labels for each axis
x.lab <- paste0("PC1 (", paste(round((pc$CA$eig[1]/pc$tot.chi*100),2)),"%)")
y.lab <- paste0("PC2 (", paste(round((pc$CA$eig[2]/pc$tot.chi*100),2)),"%)")
Plot PCA
# plot PCA
pdf(file = "./images/Mf_PCA.pdf", height = 6, width = 6)
pcaplot <- plot(pc, choices = c(1, 2), type = "n", xlab=x.lab, ylab=y.lab, cex.lab=1)
with(env.site, points(pc, display = "sites", col = env.site$cols, pch = env.site$pch, cex=1.5, bg = env.site$cols))
legend("topleft", legend = env.site$site, col=env.site$cols, pch=env.site$pch, pt.cex=1, cex=0.50, xpd=1, box.lty = 0, bg= "transparent")
dev.off()
2. Variable filtering
# extract predictor variables to matrix
env_var <- env.site[ , c("WC_bio1", "WC_bio5", "WC_bio15", "WC_bio19")]
# check for obvious correlations between variables
pairs.panels(env_var, scale=T, lm = TRUE)
## reduce variance associated with covarying variables using variance inflation factor analyses
# run procedure to remove variables with the highest VIF one at a time until all remaining variables are below your threshold
keep.env <- vif_func(in_frame=env_var, thresh=2, trace=T)
var vif
WC_bio1 21.5371434397055
WC_bio5 10.4639698247348
WC_bio15 7.69781293748316
WC_bio19 3.97039440920372
removed: WC_bio1 21.53714
# the retained environmental variables
keep.env
[1] "WC_bio5"  "WC_bio15" "WC_bio19"
reduced.env <- subset(as.data.frame(env_var), select=c(keep.env))
scaled.env <- as.data.frame(scale(reduced.env))
# let's have another look
pairs.panels(reduced.env, scale=T, lm = TRUE)
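vif_func is sourced from utils.R. As a rough sketch of what each step of that procedure does, assuming standard lm-based VIFs (illustrative, not the sourced code):
# one pass of VIF calculation: regress each predictor on the others, VIF = 1/(1 - R^2);
# vif_func repeats this, dropping the worst variable until all VIFs fall below the threshold
vif_once <- function(df) {
  sapply(names(df), function(v) {
    r2 <- summary(lm(reformulate(setdiff(names(df), v), v), data = df))$r.squared
    1 / (1 - r2)
  })
}
vif_once(as.data.frame(env_var))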
3. Run analysis
So we have formatted genotypes and environmental data.
What about controlling for spatial population structure in the model?
Do I need to control for spatial structure?
No? Great!
Yes? How do I do that?
- simple xy coordinates (or some transformation, MDS, polynomial…)
- Principal Coordinates of Neighbourhood Matrix (PCNM; R package vegan)
- Moran’s Eigenvector Maps (R package memgene)
- Fst, admixture Q values
- population allele frequency covariance matrix (BayPass omega)
We are going to use the scaled allelic covariance (Ω) estimated by BayPass to control for spatial structure in the data (see BayPass and Gates et al. 2023). For comparison, a quick sketch of the PCNM option is shown below.
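A minimal sketch of the PCNM option using vegan (illustrative; not run in this exercise):
# illustrative PCNM axes from site coordinates, usable as spatial conditioning variables
pcnm.res <- vegan::pcnm(dist(env.site[, c("X", "Y")]))
spatial.pcnm <- pcnm.res$vectors[, 1:2] # keep the first few axes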
# format spatial coordinates as matrix
xy <- as.matrix(cbind(env.site$omegaX, env.site$omegaY))

# reduced RDA model using the retained environmental variables
# conditioned on the spatial (omega) coordinates
Mf.RDA <- rda(AF ~ WC_bio5 + WC_bio15 + WC_bio19 + Condition(xy), data = scaled.env)
Mf.RDA
Call: rda(formula = AF ~ WC_bio5 + WC_bio15 + WC_bio19 + Condition(xy),
data = scaled.env)
Inertia Proportion Rank
Total 202.83442 1.00000
Conditional 9.32930 0.04599 2
Constrained 102.05924 0.50317 3
Unconstrained 91.44588 0.45084 8
Inertia is variance
Eigenvalues for constrained axes:
RDA1 RDA2 RDA3
85.00 12.97 4.09
Eigenvalues for unconstrained axes:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
66.70 6.36 5.17 3.75 3.10 2.86 2.41 1.10
# So how much genetic variation can be explained by our environmental model?
RsquareAdj(Mf.RDA)$r.squared
[1] 0.5031653
# how much inertia is associated with each axis
screeplot(Mf.RDA)
# calculate significance of the reduced model
mod_perm <- anova.cca(Mf.RDA, nperm=1000) # test significance of the model
mod_perm
Permutation test for rda under reduced model
Permutation: free
Number of permutations: 999
Model: rda(formula = AF ~ WC_bio5 + WC_bio15 + WC_bio19 + Condition(xy), data = scaled.env)
Df Variance F Pr(>F)
Model 3 102.059 2.9762 0.04 *
Residual 8 91.446
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
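Not part of the original exercise, but if you also want per-predictor significance, vegan's anova.cca can permute each term marginally:
# optional: marginal significance of each environmental predictor
term_perm <- anova.cca(Mf.RDA, by = "margin", nperm = 1000)
term_perm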
# generate x and y labels (% constrained variation)
x.lab <- paste0("RDA1 (", paste(round((Mf.RDA$CCA$eig[1]/Mf.RDA$CCA$tot.chi*100),2)),"%)")
y.lab <- paste0("RDA2 (", paste(round((Mf.RDA$CCA$eig[2]/Mf.RDA$CCA$tot.chi*100),2)),"%)")
Plot RDA
# plot RDA1, RDA2
pdf(file = "./images/Mf_RDA.pdf", height = 6, width = 6)
pRDAplot <- plot(Mf.RDA, choices = c(1, 2), type="n", cex.lab=1, xlab=x.lab, ylab=y.lab)
with(env.site, points(Mf.RDA, display = "sites", col = env.site$cols, pch = env.site$pch, cex=1.5, bg = env.site$cols))
text(Mf.RDA, "bp", choices = c(1, 2), labels = c("WC_bio5", "WC_bio15", "WC_bio19"), col="blue", cex=0.6)
legend("topleft", legend = env.site$site, col=env.site$cols, pch=env.site$pch, pt.cex=1, cex=0.50, xpd=1, box.lty = 0, bg= "transparent")
dev.off()
4. Identify candidate loci
Estimate q-values following https://kjschmidlab.gitlab.io/b1k-gbs/GenomeEnvAssociation.html#RDA_approaches and the R function rdadapt (see https://github.com/Capblancq/RDA-genome-scan).
k <- 2 # number of RDA axes to retain
rda.q <- rdadapt(Mf.RDA, K = k)
sum(rda.q$q.values <= 0.01)
[1] 373
candidate.loci <- colnames(AF)[which(rda.q[,2] < 0.01)]
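rdadapt comes from the sourced code; in outline it takes the locus loadings on the K retained axes, computes a robust Mahalanobis distance for each locus, rescales by a genomic inflation factor, and converts the distances to p- and q-values. A minimal sketch of that idea (illustrative; see Capblancq's repository for the real function):
rdadapt_sketch <- function(rda, K) {
  zscores <- rda$CCA$v[, 1:K, drop = FALSE]              # locus loadings on K constrained axes
  dists <- robust::covRob(zscores, distance = TRUE,
                          na.action = na.omit, estim = "pairwiseGK")$dist
  lambda <- median(dists) / qchisq(0.5, df = K)          # genomic inflation factor
  p <- pchisq(dists / lambda, df = K, lower.tail = FALSE)
  data.frame(p.values = p, q.values = qvalue::qvalue(p)$qvalues)
}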
# check the distribution of candidates and q-values
ggplot() +
  geom_point(aes(x=c(1:length(rda.q[,2])),
                 y=-log10(rda.q[,2])), col="gray83") +
  geom_point(aes(x=c(1:length(rda.q[,2]))[which(rda.q[,2] < 0.01)],
                 y=-log10(rda.q[which(rda.q[,2] < 0.01),2])), col="orange") +
  xlab("SNPs") + ylab("-log10(q.values)") +
  theme_bw()
# check correlations between candidate loci and environmental predictors
# this section relies on code by Brenna Forester, found at
# https://popgen.nescent.org/2018-03-27_RDA_GEA.html
npred <- 3
env_mat <- matrix(nrow=(sum(rda.q$q.values < 0.01)),
                  ncol=npred) # n columns for n predictors
colnames(env_mat) <- colnames(reduced.env)

# calculate correlation between candidate snps and environmental variables
for (i in 1:length(candidate.loci)) {
  nam <- candidate.loci[i]
  snp.gen <- AF[,nam]
  env_mat[i,] <- apply(reduced.env, 2, function(x) cor(x, snp.gen))
}

cand <- cbind.data.frame(as.data.frame(candidate.loci), env_mat)
for (i in 1:length(cand$candidate.loci)) {
  bar <- cand[i,]
  cand[i,npred+2] <- names(which.max(abs(bar[,2:4]))) # gives the variable
  cand[i,npred+3] <- max(abs(bar[,2:4]))              # gives the correlation
}
colnames(cand)[npred+2] <- "predictor"
colnames(cand)[npred+3] <- "correlation"
table(cand$predictor)
WC_bio15 WC_bio19 WC_bio5
216 47 110
write.csv(cand, "./data/RDA_candidates.csv", row.names = FALSE)
Individual level
Now let's have a look at an individual-level biplot. First, expand the environmental data from the population level to an individual data frame.
env.ind <- expand_pop2ind(Mf5000_gl, env.site)

# now subset our genotype objects to the candidate loci
candidate.gl <- gl.keep.loc(gi2gl(imputed.sorted.gi), loc.list=candidate.loci)
Starting gl.keep.loc
Starting gi2gl
Starting gl.compliance.check
Processing genlight object with SNP data
The slot loc.all, which stores allele name for each locus, is empty. Creating a dummy variable (A/C) to insert in this slot.
Checking coding of SNPs
SNP data scored NA, 0, 1 or 2 confirmed
Checking locus metrics and flags
Recalculating locus metrics
Checking for monomorphic loci
No monomorphic loci detected
Checking for loci with all missing data
No loci with all missing data detected
Checking whether individual names are unique.
Checking for individual metrics
Warning: Creating a slot for individual metrics
Checking for population assignments
Population assignments confirmed
Spelling of coordinates checked and changed if necessary to
lat/lon
Completed: gl.compliance.check
Completed: gi2gl
Processing genlight object with SNP data
List of loci to keep has been specified
Deleting all but the specified loci
Completed: gl.keep.loc
candidate.gi <- gl2gi(candidate.gl)
Starting gl2gi
Processing genlight object with SNP data
Matrix converted.. Prepare genind object...
Completed: gl2gi
# get genotypes (counts of reference allele) and clean up locus names
candidate.alleles <- candidate.gi@tab
candidate.snps <- candidate.alleles[,seq(1,ncol(candidate.alleles),2)]
colnames(candidate.snps) <- locNames(candidate.gi)
# another quick sanity check
stopifnot(all(env.ind$ind.names == indNames(candidate.gi)))
# finally run the individual based RDA
# no need to control for pop structure this time
Mfcandidate.RDA <- rda(candidate.snps ~ WC_bio5 + WC_bio15 + WC_bio19, data = env.ind)
Mfcandidate.RDA
Call: rda(formula = candidate.snps ~ WC_bio5 + WC_bio15 + WC_bio19,
data = env.ind)
Inertia Proportion Rank
Total 207.1000 1.0000
Constrained 59.6948 0.2882 3
Unconstrained 147.4052 0.7118 245
Inertia is variance
Eigenvalues for constrained axes:
RDA1 RDA2 RDA3
44.71 12.02 2.97
Eigenvalues for unconstrained axes:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
16.227 7.584 5.579 4.597 3.536 3.373 3.075 2.695
(Showing 8 of 245 unconstrained eigenvalues)
ind.mod_perm <- anova.cca(Mfcandidate.RDA, nperm=1000, parallel=4) # test significance of the model
ind.mod_perm
Permutation test for rda under reduced model
Permutation: free
Number of permutations: 999
Model: rda(formula = candidate.snps ~ WC_bio5 + WC_bio15 + WC_bio19, data = env.ind)
Df Variance F Pr(>F)
Model 3 59.695 33.073 0.001 ***
Residual 245 147.405
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
<- paste0("RDA1 (",
x.lab paste(round((Mfcandidate.RDA$CA$eig[1]/Mfcandidate.RDA$tot.chi*100),2)),"%)")
<- paste0("RDA2 (",
y.lab paste(round((Mfcandidate.RDA$CA$eig[2]/Mfcandidate.RDA$tot.chi*100),2)),"%)")
Plot RDA
# plot RDA1, RDA2
pdf(file = "./images/Mfcandidate_RDA.pdf", height = 6, width = 6)
pRDAplot <- plot(Mfcandidate.RDA, choices = c(1, 2),
                 type="n", cex.lab=1, xlab=x.lab, ylab=y.lab)
points(Mfcandidate.RDA, display = "sites", col = env.ind$cols,
       pch = env.ind$pch, cex=0.8, bg = env.ind$cols)
text(Mfcandidate.RDA, "bp", choices = c(1, 2),
     labels = c("WC_bio5", "WC_bio15", "WC_bio19"), col="blue", cex=0.6)
legend("topleft", legend = env.site$site, col=env.site$cols, pch=env.site$pch,
       pt.cex=1, cex=0.50, xpd=1, box.lty = 0, bg= "transparent")
dev.off()
GWAS genotype and phenotype association
RandomForest
1 Input file
Let’s start with randomForest.
## Get input files
load(file = "./data/GWAS.RData")
## randomForest
CaRF$Size <- as.factor(CaRF$Size)
The input is a structure file with genotypes coded as counts of the major allele per individual per locus (0 = homozygous minor allele (mA), 1 = heterozygous, 2 = homozygous major allele (MA)), with each sample's phenotype or trait in the first column instead of the population.
If the trait or phenotype is categorical rather than continuous, it should be converted to a factor.
See(CaRF)
Size LG1.22769 LG1.197846 LG1.341740 LG1.428005 LG1.606435
class <factor> <integer> <integer> <integer> <integer> <integer>
popL_L026 2 -1 -1 -1 0 -1
popL_L004 2 -1 -1 -1 -1 0
popL_L024 2 -1 -1 -1 -1 -1
popL_L010 2 -1 -1 -1 -1 -1
popL_L025 2 -1 -1 -1 -1 0
popL_L016 2 -1 -1 -1 -1 -1
[1] "class: data.frame"
[1] "dimension: 363 x 2063"
table(CaRF$Size)
1 2
180 183
So here we have the 363 samples (180 small and 183 large), genotyped for a subset of 2,062 markers from the original 17,490-marker dataset.
2 Model tuning
The first step is to create our training and validation data in a 75:25 proportion.
Then we run a simple forest to see performance without tuning.
set.seed(123)
ind <- sample(2, nrow(CaRF), replace = TRUE, prob = c(0.75, 0.25))
train <- CaRF[ind==1,]
test <- CaRF[ind==2,]

rf <- randomForest(Size~., data=train,
                   ntree = 50, # number of trees
                   mtry = 10,  # number of variables tried at each split
                   importance = TRUE,
                   proximity = TRUE)
print(rf)
print(rf)
Call:
randomForest(formula = Size ~ ., data = train, ntree = 50, mtry = 10, importance = TRUE, proximity = TRUE)
Type of random forest: classification
Number of trees: 50
No. of variables tried at each split: 10
OOB estimate of error rate: 27.44%
Confusion matrix:
1 2 class.error
1 98 39 0.2846715
2 37 103 0.2642857
But if you run this again you will get slightly different results, so we need to do some replication using different training data.
<- trainControl(method="oob", number=5)
control set.seed(123)
<- "Accuracy"
metric <- 10
mtry <- expand.grid(.mtry=mtry )
tunegrid <- train(Size~., data=CaRF, method="rf", metric=metric, tuneGrid=tunegrid, trControl=control, ntree=50)
rf_small rf_small
Random Forest
363 samples
2062 predictors
2 classes: '1', '2'
No pre-processing
Resampling results:
Accuracy Kappa
0.7410468 0.4818695
Tuning parameter 'mtry' was held constant at a value of 10
Here we also get Kappa, a measure of how much better the model's classification is than a random classification. This is important when there is a significant skew in sample sizes.
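As a quick illustration of what Kappa measures, it can be computed by hand from the OOB confusion matrix printed above for the first forest:
# Cohen's kappa from the OOB confusion matrix of rf (counts 98/39/37/103)
cm <- matrix(c(98, 39, 37, 103), nrow = 2, byrow = TRUE)
po <- sum(diag(cm)) / sum(cm)                    # observed accuracy
pe <- sum(rowSums(cm) * colSums(cm)) / sum(cm)^2 # accuracy expected by chance
(kap <- (po - pe) / (1 - pe))                    # ~0.45 for the first forest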
Now let’s check what the default parameter can do.
<- trainControl(method="oob", number=5)
control set.seed(123)
<- "Accuracy"
metric <- sqrt(ncol(CaRF))
mtry <- expand.grid(.mtry=mtry )
tunegrid <- train(Size~., data=CaRF, method="rf", metric=metric, tuneGrid=tunegrid, trControl=control)
rf_default rf_default
Random Forest
363 samples
2062 predictors
2 classes: '1', '2'
No pre-processing
Resampling results:
Accuracy Kappa
0.7465565 0.4928012
Tuning parameter 'mtry' was held constant at a value of 45.42026
~75% accuracy is not too bad, but can we improve it?
MODEL TUNING: Do not RUN now
The next couple of steps could take too long, so here are the commands for you to try in your own time.
<- trainControl(method="oob", number=10, search="random")
control set.seed(123)
<- sqrt(ncol(CaRF))
mtry <- train(Size~., data=CaRF, method="rf", metric=metric, tuneLength=15, trControl=control)
rf_random
(rf_random)plot(rf_random)
#### Expected output
# mtry Accuracy Kappa
# 195 0.7703453 0.5403670
# 348 0.7685185 0.5367943
# 526 0.7684685 0.5367281
# 665 0.7721972 0.5441283
# 953 0.7730731 0.5458833
# 1011 0.7694444 0.5386467
# 1017 0.7702202 0.5401206
# 1038 0.7685435 0.5367490
# 1115 0.7740240 0.5477867
# 1142 0.7750000 0.5497308
# 1253 0.7740490 0.5478857
# 1268 0.7740240 0.5477339
# 1627 0.7768018 0.5533787
# 1842 0.7768268 0.5533826
# 2013 0.7749750 0.5496527
You can also test combinations of ntree and mtry values. Unfortunately, this is the most time-consuming part, but here is the code for you to try.
<- trainControl(method="repeatedcv", number=10, repeats=3, search="grid")
control <- expand.grid(.mtry= c(565, 665, 765))
tunegrid <- list()
modellist <-123
seed<- "Accuracy"
metric for (ntree in c(200, 300, 500, 1000)) {
set.seed(seed)
<- train(Size~., data=CaRF, method="rf", metric=metric, tuneGrid=tunegrid, trControl=control, ntree=ntree)
fit <- toString(ntree)
key <- fit
modellist[[key]]
}# Then you can compare tuning results
<- resamples(modellist)
results summary(results)
dotplot(results)
Back to the exercise
3 Final model
The best combination was ntree = 200 and mtry = 665.
Now we run our forest with the optimal settings based on our tuning.
<- trainControl(method="oob", number=5)
control set.seed(123)
<- "Accuracy"
metric <- 665
mtry <- expand.grid(.mtry=mtry)
tunegrid <- train(Size~., data=CaRF, method="rf", metric=metric, tuneGrid=tunegrid, trControl=control, ntree=200)
rf_best rf_best
Random Forest
363 samples
2062 predictors
2 classes: '1', '2'
No pre-processing
Resampling results:
Accuracy Kappa
0.7823691 0.5645301
Tuning parameter 'mtry' was held constant at a value of 665
A ~3% accuracy gain doesn't look like much, but as a disclaimer, these are just SNPs from the 4 chromosomes we had already identified as important for growth.
4 Candidate SNPs
Now let's check what these markers are.
varImpPlot(rf_best$finalModel)
varImpPlot(rf)
We can determine their importance using their effect on two metrics:
Accuracy - the ratio of correct classifications to the total number of samples
Gini - a measure of the purity of the feature splits in the trees
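To rank markers numerically rather than reading them off the plots, the importance matrix can be pulled from the first forest (which was fitted with importance = TRUE); a small illustrative snippet:
# top 10 SNPs by mean decrease in accuracy
imp <- randomForest::importance(rf)
head(imp[order(imp[, "MeanDecreaseAccuracy"], decreasing = TRUE), ], 10)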
Ok let’s change approach and check concordance:
RAINBOWR
1 Inputs
First, we integrate our inputs into a GWAS format for RAINBOWR.
And then we check each dataset format:
Phenotype: just two columns, ID and phenotype. Genotype: the same format as for randomForest, but coded -1 hom mA, 0 het, 1 hom MA. Map: the chromosome and position of each SNP.
modify.CaGW.res <- modify.data(pheno.mat = CaGW_phe, geno.mat = CaGW_geno, map = CaGW_map, return.ZETA = TRUE, return.GWAS.format = TRUE)
pheno.GWAS <- modify.CaGW.res$pheno.GWAS
geno.GWAS <- modify.CaGW.res$geno.GWAS
### View each dataset
See(pheno.GWAS)
Sample_names Size
class <character> <integer>
popL_L026 popL_L026 2
popL_L004 popL_L004 2
popL_L024 popL_L024 2
popL_L010 popL_L010 2
popL_L025 popL_L025 2
popL_L016 popL_L016 2
[1] "class: data.frame"
[1] "dimension: 363 x 2"
See(geno.GWAS)
marker chrom pos popL_L026 popL_L004 popL_L024
class <character> <integer> <integer> <integer> <integer> <integer>
1 LG1:10070023 1 10070023 -1 -1 -1
2 LG1:10196551 1 10196551 -1 -1 -1
3 LG1:10271823 1 10271823 -1 -1 -1
4 LG1:10361941 1 10361941 0 -1 -1
5 LG1:10398562 1 10398562 -1 0 -1
6 LG1:10529180 1 10529180 -1 -1 -1
[1] "class: data.frame"
[1] "dimension: 17490 x 366"
See(CaGW_map)
marker chr pos
class <character> <integer> <integer>
LG1:10070023 LG1:10070023 1 10070023
LG1:10196551 LG1:10196551 1 10196551
LG1:10271823 LG1:10271823 1 10271823
LG1:10361941 LG1:10361941 1 10361941
LG1:10398562 LG1:10398562 1 10398562
LG1:10529180 LG1:10529180 1 10529180
[1] "class: data.frame"
[1] "dimension: 17490 x 3"
2 Single-SNP based GWAS
Now we calculate ZETA, a list of genomic relationship matrices (GRMs). ZETA is basically a random effect and accounts for possible confounding/covariate factors like population structure and relatedness.
ZETA <- modify.CaGW.res$ZETA

### Population structure, using a structure Q values matrix; n.PC should be > 0
normal.res <- RGWAS.normal(pheno = pheno.GWAS, geno = geno.GWAS, ZETA = ZETA, structure.matrix = QMATRIX, n.PC = 4, P3D = TRUE)

#### other cofactors, like sex or age (a two-column table similar to the phenotype table)
normal.res <- RGWAS.normal(pheno = pheno.GWAS, geno = geno.GWAS, ZETA = ZETA, structure.matrix = QMATRIX, covariate = SEX_AGE, n.PC = 4, P3D = TRUE)
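For intuition about what sits inside ZETA, here is a minimal sketch of a VanRaden-style GRM built from a -1/0/1 genotype matrix (illustrative only; RAINBOWR's calcGRM, used in the prediction section below, is the real thing):
# illustrative VanRaden-style GRM from a -1/0/1 genotype matrix M
grm_sketch <- function(M) {
  p <- colMeans(M + 1) / 2                 # allele frequencies
  W <- sweep(M + 1, 2, 2 * p)              # centre genotype counts by 2p
  tcrossprod(W) / (2 * sum(p * (1 - p)))   # standardise by expected heterozygosity
}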
Now, to perform single-SNP GWAS:
normal.res <- RGWAS.normal(pheno = pheno.GWAS, geno = geno.GWAS, ZETA = ZETA, n.PC = 4, P3D = TRUE)
First, the QQ plot shows that our data deviate only a little from the expected distribution, and only at the end of the tail.
Second, the Manhattan plot shows a few SNPs significantly associated with size but, more importantly, a peak of relatively high-importance SNPs at the end of chromosome 16.
So, what are the most important SNPs?
normal.res$D[normal.res$D$Size >= 4, ]
marker chrom pos Size
543 LG1:34671481 1 34671481 6.266137
8335 LG11:34898993 11 34898993 4.304650
11893 LG16:4658138 16 4658138 4.758395
11942 LG16:6898540 16 6898540 5.501243
11992 LG16:8984283 16 8984283 4.677513
11995 LG16:8986211 16 8986211 5.995467
12004 LG16:9625373 16 9625373 4.453957
12005 LG16:9719769 16 9719769 4.451432
12007 LG16:9838071 16 9838071 4.261391
11793 LG16:28880624 16 28880624 5.004830
15566 LG22:2463072 22 2463072 4.568311
17287 CauratusV1_scaffold_3461:3888 10141 3888 4.214943
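The cutoff of 4 used above is just a convenient -log10(p) value; if you want a formal criterion, one option is a Bonferroni correction across all tested markers:
# illustrative Bonferroni threshold on the -log10(p) scale (alpha = 0.05)
bonf.thresh <- -log10(0.05 / nrow(CaGW_map)) # ~5.54 for 17,490 markers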
3 Genomic prediction
How much of the phenotype can we really predict?
To determine that, we use a mixed-model approach to calculate heritability; in other words, the genetic contribution to our trait variation.
K.A2 <- calcGRM(genoMat = CaGW_geno) ### additive effects
k.AA2 <- K.A2 * K.A2                 ### epistatic effects

y <- as.matrix(pheno.GWAS[,'Size', drop = FALSE])
Z <- design.Z(pheno.labels = rownames(y), geno.names = rownames(K.A2))
pheno.mat2 <- y[rownames(Z), , drop = FALSE]
ZETA2 <- list(A = list(Z = Z, K = K.A2), AA = list(Z = Z, K = k.AA2))
EM3.res <- EM3.cpp(y = pheno.mat2, X = NULL, ZETA = ZETA2) ### multi-kernel linear mixed effects model

(Vu <- EM3.res$Vu) ### estimated genetic variance
[1] 0.1655835
(Ve <- EM3.res$Ve) ### estimated residual variance
[1] 0.07379584
(weights <- EM3.res$weights) ### estimated proportion of the two genetic variances
[1] 0.91355965 0.08644035
(herit <- Vu * weights / (Vu + Ve)) ### genomic heritability
[1] 0.63192756 0.05979253
Now we can test the performance of our genomic prediction with 10-fold cross-validation.
If you have NAs in your phenotype dataset you have to remove them first:
noNA <- !is.na(pheno.mat2[, 1]) ### rows with phenotype data
phenoNoNA <- pheno.mat2[noNA, , drop = FALSE] ### remove NAs
But our data doesn't have any NAs.
nFold <- 10
nLine <- nrow(pheno.mat2)
idCV <- sample(1:nLine %% nFold) ### assign random ids for cross-validation
idCV[idCV == 0] <- nFold

yPred <- rep(NA, nLine)
for (noCV in 1:nFold) {
  print(paste0("Fold: ", noCV))
  yTrain <- pheno.mat2
  yTrain[idCV == noCV, ] <- NA ### prepare test data
  EM3.gaston.resCV <- EM3.general(y = yTrain, X0 = NULL, ZETA = ZETA2, ### multi-kernel ZETA2 from above
                                  package = "gaston", return.u.always = TRUE,
                                  pred = TRUE, return.u.each = TRUE,
                                  return.Hinv = TRUE)
  yTest <- EM3.gaston.resCV$y.pred ### predicted values
  yPred[idCV == noCV] <- yTest[idCV == noCV]
}
[1] "Fold: 1"
[1] "Fold: 2"
[1] "Fold: 3"
[1] "Fold: 4"
[1] "Fold: 5"
[1] "Fold: 6"
[1] "Fold: 7"
[1] "Fold: 8"
[1] "Fold: 9"
[1] "Fold: 10"
Plot Genomic Prediction.
plotRange <- range(pheno.mat2, yPred)
plot(x = pheno.mat2, y = yPred, xlim = plotRange, ylim = plotRange,
     xlab = "Observed values", ylab = "Predicted values",
     main = "Results of Genomic Prediction (multi-kernel)",
     cex.lab = 1.5, cex.main = 1.5, cex.axis = 1.3)
abline(a = 0, b = 1, col = 2, lwd = 2, lty = 2)
R2 <- cor(x = pheno.mat2[, 1], y = yPred) ^ 2
text(x = plotRange[2] - 10,
     y = plotRange[1] + 10,
     paste0("R2 = ", round(R2, 3)),
     cex = 1.5)
Not really the best predictive power, and even worse than randomForest, which used only ~2K SNPs.
It is important to note that combinations of methods can improve our power to predict complex traits that are polygenic.
Using haplotype-based approaches and Bayesian methods helps a lot.
Annotation of the regions of chromosome 16 shows the presence of several genes related to growth and body development.
Further steps
It is time to test how structural variants can improve the power of GWAS.
Readings
Attard et al. (2022)
Brauer et al. (2018)
Batley et al. (2021)
Smith et al. (2020)
Brauer et al. (2023)
Grummer et al. (2019)
Pratt et al. (2022)
Sandoval-Castillo et al. (2018)