simgen/popgen-tracts-dating.qmd at main · bodkan/simgen · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# Admixture tracts and dating

**WIP:** Extracting ancestry tracts, chromosome painting, and using
admixture tracts and LD patterns for dating admixture events.

```{r}
library(slendr)
init_env()

library(cowplot)

# helper function for plotting tracts simulated later
plot_tracts <- function(tracts, ind) {
  ind_tracts <- filter(tracts, name %in% ind) %>%
    mutate(haplotype = paste0(name, "\nhap. ", haplotype))
  ind_tracts$haplotype <- factor(ind_tracts$haplotype, levels = unique(ind_tracts$haplotype[order(ind_tracts$node_id)]))

  ggplot(ind_tracts) +
    geom_rect(aes(xmin = left, xmax = right, ymin = 1, ymax = 2, fill = name), linewidth = 1) +
    labs(x = "coordinate along a chromosome [bp]") +
    theme_bw() +
    theme(
      legend.position = "none",
      axis.text.y = element_blank(),
      axis.ticks = element_blank(),
      panel.border = element_blank(),
      panel.grid = element_blank()
    ) +
    facet_grid(haplotype ~ .) +
    expand_limits(x = 0) +
    scale_x_continuous(labels = scales::comma)
}

popZ <- population("popZ", time = 3000, N = 5000)
popX <- population("popX", time = 1500, N = 5000, parent = popZ)
popY <- population("popY", time = 1500, N = 5000, parent = popZ)

gf <- gene_flow(from = popX, to = popY, rate = 0.2, start = 800, end = 799)

model <- compile_model(list(popZ, popX, popY), generation_time = 1, gene_flow = gf)

schedule <- rbind(
  schedule_sampling(model, times = seq(1500, 0, by = -100), list(popY, 50)),
  schedule_sampling(model, times = 0, list(popX, 10), list(popZ, 10))
)

# Use the function plot_model() to make sure that the model and the sampling schedule
# are defined correctly (there's no such thing as too many sanity checks when doing research)
plot_model(model, proportions = TRUE, samples = schedule)

ts <- msprime(model, samples = schedule, sequence_length = 100e6, recombination_rate = 1e-8)


library(dplyr)
library(ggplot2)

tracts <- ts_tracts(ts, census = 800)
tracts

sample_times <- ts_samples(ts) %>% select(name, time)

# adding haplotype ID and time (automatically done by the dev slendr)
tracts <- tracts %>% dplyr::group_by(name) %>% dplyr::mutate(haplotype = dplyr::dense_rank(node_id)) %>% dplyr::ungroup() %>% inner_join(sample_times)

# Select sampled individuals from different ages (remember, we recorded oldest individuals
# at 1000 generations ago, the youngest individuals at 0 generations ago). Use the function
# plot_tracts to visualize their ancestry tracts. What do you see in terms of the number of
# tracts and the length of tracts across these individuals? Can you eyeball some distinctive
# pattern?

subset_inds <- tracts %>% group_by(time) %>% distinct(name) %>% slice_sample(n = 1) %>% pull(name)
subset_inds

plot_tracts(tracts, subset_inds)

# It looks like the older individual is, the closer they lived to the start of the admixture event,
# and the longer the tracts they carry will be. Compute the average length of a ancestry tracts in each
# sample age group and visualize the length distribution of these tracts based on their age:

tracts %>%
  group_by(time) %>%
  summarise(mean(length))

ggplot(tracts, aes(length, color = factor(time))) +
  geom_density() +
  coord_cartesian(xlim = c(0, 3e6)) +
  theme_bw()


# Now, try to work backwards. Assuming you have the following distribution of tract lengths...

tracts <- filter(tracts, time == 0, length <= 1e6)
bins <- hist(tracts$length, breaks = 50, plot = FALSE)
length <- bins$mids
density <- bins$density

plot(length, density)

lambda_mle <- 1 / mean(tracts$length)
lambda_mle / 1e-8

y_mle <- dexp(length, rate = lambda_mle)
lines(length, y_mle, lty = 2, col = "darkgreen", lwd = 2)

nls_res <- nls(density ~ SSasymp(length, Asym, R0, lrc))
nls_res

lambda_nls <- exp(unname(coef(nls_res)["lrc"]))
lambda_nls / 1e-8

y_nls <- predict(nls_res, newdata = data.frame(length = length))
lines(length, y_nls, lty = 2, col = "purple", lwd = 2)

legend("topright", fill = c("darkgreen", "purple"),
       legend = c(paste("MLE, t =", round(lambda_mle / 1e-8, 1), "generations ago"),
                  paste("MLE, t =", round(lambda_nls / 1e-8, 1), "generations ago")))

```