MAPpoly training section�
Marcelo Mollinari
mmollin@ncsu.edu
SCRI Workshop – January 14, 2021
This work is licensed under CC BY 4.0
Outline
Initial notes about MAPpoly
Installation
install.packages("mappoly")
install.packages("devtools")
devtools::install_github("mmollina/mappoly", dependencies=TRUE)
Reading/importing data sets
Supported datasets
Supported R objects
data set
mappoly.data
Reading/importing datasets
1. MAPpoly
dat.mappoly <- read_geno(file.in = 'path_to_input_file')
2. MAPpoly with genotype probabilities
dat.mappoly.prob <- read_geno_prob(file.in = 'path_to_input_file)
3. CSV
dat.csv <- read_geno_csv(file.in = 'path_to_input_file', ploidy = 6)
4. fitPoly, supports genotype probabilities
dat.fitpoly <- read_fitpoly(file.in = 'path_to_input_file', ploidy = 6,
parent1 = 'P1', parent1 = 'P2')
5. VCF files, supports genotype probabilities (PL field)
dat.vcf <- read_vcf(file.in = 'path_to_input_file', ploidy = 6,
parent1 = 'P1', parent1 = 'P2',
min.gt.depth = 0, min.av.depth = 0)
Reading/importing datasets
1. updog
dat.updog <- import_from_updog(object = input_multidog_object)
2. polyRAD
dat.polyRAD <- polyRAD::Export_MAPpoly(object = 'polyrad_object')
3. polymapR
dat.polymapr <- import_data_from_polymapR(input.data = polymapR_screened_data,
ploidy = 6, parent1 = 'P1',
parent1 = 'P2')
map.polymapr <- import_phased_maplist_from_polymapR(maplist = phased_maplist,
mappoly.data = mappoly.data)
Importing data
Importing maps
Reading/importing datasets
## Downloading data from GitHub
library("mappoly")
setwd("SCRI/MAPpoly/tetra")
address <- "https://github.com/mmollina/SCRI/raw/main/data/fitpoly_tetra_call/B2721_scores.zip"
download.file(url = address, destfile = "B2721_scores.zip")
unzip(tempfl, files = "B2721_scores.dat")
## Reading fitPoly data
dat <- read_fitpoly(file.in = "B2721_scores.dat",
ploidy = 4,
parent1 = "Atlantic",
parent2 = "B1829",
verbose = TRUE)
source("get_solcap_snp_pos.R")
plot(dat)
print(dat, detailed = TRUE)
Genotype calling using fitPoly:
https://github.com/mmollina/SCRI/blob/main/data/fitpoly_tetra_call/B2721_fitpoly_call.R
Adding Solanum tuberosum genome v4.03 information:
https://github.com/mmollina/SCRI/blob/main/MAPpoly/get_solcap_snp_pos.R
Inspecting data loaded from fitPoly files
Inspecting specific marker
plot_mrk_info(dat, 738)
print_mrk(dat, 738)
plot_mrk_info(dat, "solcap_snp_c1_3722")
print_mrk(dat, "solcap_snp_c1_3722")
Filtering by missing data
dat <- filter_missing(input.data = dat,
type = "marker",
filter.thres = 0.05)
dat <- filter_missing(input.data = dat,
type = "individual",
filter.thres = 0.025)
Filtering by segregation distortion
s <- filter_segregation(input.data = dat, chisq.pval.thres = 0.05/dat$n.mrk)
s <- make_seq_mappoly(s)
plot(s)
Making a sequence of markers
mappoly.sequence
mappoly.data
mappoly.group
mappoly.map
mappoly.chitest.seq
mappoly.pcmap
mappoly.geno.ord
Two-point analysis
## Two-point analysis
nc <- parallel::detectCores() - 1
tpt10 <- est_pairwise_rf(s10, ncpus = nc, est.type = "disc")
tpt10$pairwise[90:92]
round(tpt10$pairwise[[91]], 2)
plot(tpt10, first.mrk = 1, second.mrk = 2074)
Atlantic
B1829-5
1
2074
1
2074
Recombination fraction matrix
m <- rf_list_to_matrix(tpt, ncpus = nc)
gen.ord <- get_genomic_order(s)
s.gen.ord <- make_seq_mappoly(gen.ord)
plot(m, ord = s.gen.ord$seq.mrk.names, fact = 10)
Filtering by pairwise recombination fraction
gr <- group_mappoly(m, expected.groups = 12,
comp.mat = TRUE)
gr
Ordering within a specific linkage group
s1 <- make_seq_mappoly(gr, arg = 1,
genomic.info = 1)
tpt1 <- make_pairs_mappoly(tpt, s1)
m1 <- make_mat_mappoly(m, s1)
##Genomic order
g.o1 <- get_genomic_order(s1)
s1.g <- make_seq_mappoly(g.o1)
plot(g.o1)
plot(m1, ord = s1.g$seq.mrk.names,
fact = 3)
Genome order
Ordering within a specific linkage group
mds.o1 <- mds_mappoly(m1, n = c(201,47))
plot(mds.o1)
s1.mds <- make_seq_mappoly(mds.o1)
plot(m1, ord = s1.mds$seq.mrk.names)
MDS order
Phasing and HMM estimation of distance
lg1.geno.map<-est_rf_hmm_sequential(input.seq = s1.g,
start.set = 3,
thres.twopt = 10,
thres.hmm = 50,
extend.tail = 30,
twopt = tpt1,
verbose = TRUE,
tol = 10e-2,
tol.final = 10e-4,
phase.number.limit = 20,
sub.map.size.diff.limit = 5,
info.tail = TRUE)
Phasing and HMM estimation of distance
Argument | Value |
start.set | 3 – 10 |
thres.twopt | 10 |
thres.hmm | 10 – 50 |
extend.tail | 30 – 100 |
phase.number.limit | 20 |
sub.map.size.diff.limit | 2 (thousands per LG) … 10 (hundred(s) per LG) |
Argument | Value |
start.set | 3 – 5 |
thres.twopt | 10 |
thres.hmm | 10 – 50 |
extend.tail | 50 – 200 |
phase.number.limit | 20 |
sub.map.size.diff.limit | 2 (thousands per LG) … 10 (hundred(s) per LG) |
Tetraploid
Hexaploid
Resulting map
plot(lg1.geno.map, mrk.names = TRUE, cex = 0.5, P = "Atlantic", Q = "B1829")
Resulting map – error modeling
lg1.geno.map.err <- est_full_hmm_with_global_error(lg1.geno.map, error = 0.05, tol = 10e-4)
plot(lg1.geno.map.err, mrk.names = TRUE, cex = 0.5, P = "Atlantic", Q = "B1829")
Resulting map: 20-30 cM segment
plot(lg1.geno.map.err,
mrk.names = TRUE,
left.lim = 19,
right.lim = 20,
P = "Atlantic",
Q = "B1829")
Probabilistic haplotype reconstruction
g.lg10 <- calc_genoprob(lg10.geno.map, step = 1)
h.lg10 <- calc_homoprob(g.lg10)
plot(h.lg10, ind = "B2721.110")
Probabilistic haplotype reconstruction
g.lg10.err <- calc_genoprob_error(lg10.geno.map.err, step = 1, error = 0.05)
h.lg10.err <- calc_homoprob(g.lg10.err)
plot(h.lg10.err, ind = "B2721.110")
Preferential pairing profiles
p1 <- calc_prefpair_profiles(g1)
plot(p1, min.y.prof = 0.25, max.y.prof = 0.40)
Parallel map construction – genomic order
#### Parallel map construction
cl <- parallel::makeCluster(12)
parallel::clusterEvalQ(cl, require(mappoly))
parallel::clusterExport(cl, "dat")
# ~12.5 minutes
MAPs.geno <- parallel::parLapply(cl, LGS, phasing_and_hmm_rf)
# ~2.5 minutes
MAPs.geno.err <- parallel::parLapply(cl, MAPs.geno,
error_model)
# ~22 seconds
genoprob <- parallel::parLapply(cl,
MAPs.geno.err,
calc_genoprob_error,
step = 1,
error = 0.05)
parallel::stopCluster(cl)
#### Functions ####
phasing_and_hmm_rf <- function(X){
dir.create("map_output", showWarnings = FALSE)
fl <- paste0("output_map_ch_", X$seq$sequence[1], ".txt")
fl <- file.path("map_output", fl)
sink(fl)
map <- est_rf_hmm_sequential(input.seq = X$seq,
start.set = 3,
thres.twopt = 10,
thres.hmm = 50,
extend.tail = 30,
twopt = X$tpt,
verbose = TRUE,
phase.number.limit = 20,
sub.map.size.diff.limit = 5)
sink()
return(map)
}
error_model <- function(X, error = 0.05, tol = 10e-4){
x <- est_full_hmm_with_global_error(input.map = X,
error = error,
tol = tol,
verbose = FALSE)
return(x)
}
#### Map results
map.out <- plot_map_list(MAPs.geno.err, col = "ggstyle")
map.out
plot_genome_vs_map(MAPs.geno.err, same.ch.lg = TRUE)
summary_maps(MAPs.geno.err)
export_map_list(MAPs.geno.err, file = "output_map.csv")
#### Preferential pairing
pp <- calc_prefpair_profiles(genoprob)
print(pp)
head(pp$prefpair.psi)
plot(pp, P = "Atlantic", Q = "B1829")
#### Haplotype probabilities
hp <- calc_homoprob(genoprob)
print(hp)
plot(hp, ind = 2, lg = 1)
plot(hp, ind = 2, lg = 1:12, use.plotly = FALSE)
#### Correspondence with genome
z<-as.numeric(colnames(gr$seq.vs.grouped.snp)[1:12])
#### Assembling linkage groups (order based on genome)
LGS<-vector("list", 12)
for(ch in 1:12){
cat("\n ~~~~~~ ch:", ch, "...\n")
lg <- which(z==ch)
s.temp<-make_seq_mappoly(gr, lg, genomic.info = 1)
tpt.temp <- make_pairs_mappoly(tpt, s.temp)
s.temp.filt <- rf_snp_filter(tpt.temp, 5, 5, 0.15, c(0.05, 1))
m.temp <- make_mat_mappoly(m, s.temp.filt)
g.o <- get_genomic_order(s.temp)
s.g <- make_seq_mappoly(g.o)
tpt.temp <- make_pairs_mappoly(tpt, input.seq = s.g)
LGS[[ch]] <- list(seq = s.g, tpt = tpt.temp)
}
Results
Results – phased map
Results
On-line resources
�APPENDIX��Hexaploid map – Results��
Hexaploid map – Results
Biparental Population - BT
Beauregard
Tanzania
X
Biparental Population - BT
Results for hexaploid sweetpotato BT population
Recombination fraction matrix and grouping
Ordered recombination fraction matrix
Map results
Homologs probability
Preferential pairing profiles