Import and data prep

Import packages:

library(stringr)
library(data.table)
library(ggplot2)
library(paletteer)
library(lubridate)
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(scales)
library(vegan)
library(seqinr)
library(pegas)


# Suppress summarise info
options(dplyr.summarise.inform = FALSE)

Read in SPSP surveillance data and prepare data:

# read in data
spsp = fread('data/spsp_data/SPSP-dump_220805.tsv')
spsp$week = format(spsp$sample.isolation_date, format='%Y-%V') # %V is the ISO week (weeks start on Monday)
spsp$week[spsp$week == '2021-53'] = '2020-53' # %Y is the calendar year while %V is the ISO week, so dates around New Year can be labelled with the wrong year; fix those manually
spsp$week[spsp$week == '2022-52'] = '2021-52'
spsp$count = rep(1, nrow(spsp))
spsp$annotation_pangolin.lineage = as.factor(spsp$annotation_pangolin.lineage)

# swiss population
swisspop = 8670300 # 2020 census; source: https://www.bfs.admin.ch/bfs/en/home/statistics/population.html 


# extract columns of interest
spsp_sub = as.data.frame(spsp) %>%
  select(date = sample.isolation_date, week = week,
         pango = assembly.annotation_pangolin.lineage,
         pango_simpl = assembly.annotation_pangolin.lineage,
         strain = sample.strain_name,
         canton = sample_origin_location.country_division,
         submission_date = submission.date,
         purpose = sample.sequencing_purpose,
         gisaid = gisaid_info.id_ext)
spsp_sub$date = as.Date(spsp_sub$date)
spsp_sub$submission_date = as.Date(spsp_sub$submission_date)

# simplify pango lineages for VOCs
spsp_sub$pango_simpl = str_replace(spsp_sub$pango_simpl, '^B\\.1\\.1\\.7$|^Q\\..+', 'alpha')
spsp_sub$pango_simpl = str_replace(spsp_sub$pango_simpl, '^AY\\..+|^B\\.1\\.617\\..*', 'delta')
spsp_sub$pango_simpl = str_replace(spsp_sub$pango_simpl, '^B\\.1\\.1\\.529$', 'omicron')
spsp_sub$pango_simpl = str_replace(spsp_sub$pango_simpl, '^BA\\.1\\..+|^BA\\.1$|^BC\\.1|^BD\\.1', 'omicron1')
spsp_sub$pango_simpl = str_replace(spsp_sub$pango_simpl, '^BA\\.2\\..+|^BA\\.2$', 'omicron2')
spsp_sub$pango_simpl = str_replace(spsp_sub$pango_simpl, '^BA\\.3\\..+|^BA\\.3$', 'omicron3')
spsp_sub$pango_simpl = str_replace(spsp_sub$pango_simpl, '^BA\\.4\\..+|^BA\\.4$', 'omicron4')
spsp_sub$pango_simpl = str_replace(spsp_sub$pango_simpl, '^BA\\.5\\..+|^BA\\.5$|^BF\\.1|^BE\\.1', 'omicron5')
spsp_sub$pango_simpl = str_replace(spsp_sub$pango_simpl, '^XD$|^XAD$|^XE$|^XAB$|^XM$|^XAG$', 'omicron_rec')
spsp_sub$pango_simpl = str_replace(spsp_sub$pango_simpl, '^P\\.1\\..+|^P\\.1$', 'gamma')
spsp_sub$pango_simpl = str_replace(spsp_sub$pango_simpl, '^B\\.1\\.351$|^B\\.1\\.351\\..*', 'beta')
spsp_sub$pango_simpl = as.factor(spsp_sub$pango_simpl)
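# (optional sketch) the same mapping could also be kept in one named pattern vector
# and applied in a loop, which is easier to extend when new sublineages appear;
# `voc_patterns` is a hypothetical helper, shown here for two of the labels:
# voc_patterns = c(alpha = '^B\\.1\\.1\\.7$|^Q\\..+', delta = '^AY\\..+|^B\\.1\\.617\\..*')
# for (voc in names(voc_patterns)){
#   spsp_sub$pango_simpl = str_replace(spsp_sub$pango_simpl, voc_patterns[voc], voc)
# }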

# correct for (obviously) incorrect collection dates, i.e. records dated before
# the variant plausibly existed
# alpha
spsp_sub = subset(spsp_sub, !(pango_simpl == 'alpha' & date < as.Date('2020-11-09')))
# delta
spsp_sub = subset(spsp_sub, !(pango_simpl == 'delta' & date < as.Date('2021-03-01')))

# remove "outbreak suspicion" samples to improve untargeted cluster finding
spsp_sub = subset(spsp_sub, !purpose %in% 'OUTBREAK_SUSPICION')


# LINEAGE COLOURS
df.mycol=read.table('data/colours_lineages_complete.tsv', header=F, sep='\t', comment.char = '')
mycol = unlist(df.mycol$V2)
names(mycol) = unlist(df.mycol$V1)
vocs = c('alpha', 'delta', 'omicron')

Analysis of the situation in Switzerland during the pandemic

Turnaround time (TAT)

TAT during the official FOPH surveillance period from March 2021 to the end of March 2022:

# turn around time
spsp_sub$tat = spsp_sub$submission_date - spsp_sub$date
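# tat is a 'difftime' in days (submission date minus collection date)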
# subset for surveillance period
spsp_sub_surv = spsp_sub[spsp_sub$date >= '2021-03-01' & spsp_sub$date <= '2022-03-31',]
median(spsp_sub_surv$tat)
## Time difference of 18 days
# interquartile range
IQR(spsp_sub_surv$tat)
## [1] 14

SARS-CoV-2 weekly cases, sequencing and lineage distribution

spsp_summary_week = by(spsp_sub[,'pango'], list(spsp_sub$week), table)

df_sequed_week= data.frame(Var1=character(), Freq=numeric(), week=character())
for (i in 1:length(spsp_summary_week)) {
  dat = data.frame(spsp_summary_week[[i]])
  dat$week = labels(spsp_summary_week[i])
  df_sequed_week = rbind(df_sequed_week,dat)
}
colnames(df_sequed_week)=c("pango","value","week")
df_sequed_week = df_sequed_week[df_sequed_week$value > 0,]

df_sequed_week$week = as.factor(df_sequed_week$week)
df_sequed_week$pango = as.factor(df_sequed_week$pango)

data = df_sequed_week %>% group_by(week, pango, .drop=FALSE) %>% summarise(n=sum(value)) %>% mutate(percentage = n/sum(n))
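# note: the by()/rbind loop above can be written more compactly with dplyr
# (a sketch; should match df_sequed_week up to row and column order):
# df_sequed_week_alt = spsp_sub %>% count(week, pango, name = 'value')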



# PROPORTIONAL STACKED AREA PLOT OF WEEKLY CASES AND CASES DATA
cases = read.csv('data/sars_data_CH/sars_daily-cases.csv')
cases$datum = as.Date(cases$datum)
cases$week = format(cases$datum, format='%Y-%V')
cases$week[cases$week == '2021-53'] = '2020-53'
cases$week[cases$week == '2022-52'] = '2021-52'
end_date = max(spsp$sample.isolation_date)
cases = cases %>% select(geoRegion=geoRegion, date=datum, week=week, entries=entries, sumTotal=sumTotal) %>% filter(date <= end_date)

cases_CH = cases[cases$geoRegion == 'CH',]
cases_week_CH = aggregate(cases_CH$entries, by=list(week=cases_CH$week), FUN=sum)
cases_week_CH$week = as.factor(cases_week_CH$week)


# plot % sequenced cases per week
cases_week_CH$sequed = unlist(lapply(cases_week_CH$week, function(this.week){nrow(spsp_sub[spsp_sub$week %in% this.week,])}))
cases_week_CH$percSequed = cases_week_CH$sequed/cases_week_CH$x*100


pango_new = unlist(list(data$pango, as.factor(rep('', nrow(cases_week_CH)))))
data_cases = data.frame(week = c(data$week, cases_week_CH$week), var=c(rep('lineages', nrow(data)), rep('cases', nrow(cases_week_CH))),
                        pango=as.factor(pango_new), n = c(data$n, cases_week_CH$x))
# $lineages --> number of sequences in SPSP
# $cases --> number of cases

# dummy factor level 'lightblue' gives the percSeq panel its own fill colour
data_cases2 = rbind(data_cases, data.frame(week=cases_week_CH$week, var=rep('percSeq', nrow(cases_week_CH)), pango=rep('lightblue', nrow(cases_week_CH)), n=cases_week_CH$percSequed))
data_cases2$var = factor(data_cases2$var, levels=c('cases', 'percSeq', 'lineages'))
# colour vector
mycol2 = c(hue_pal()(length(levels(data$pango))), 'dimgrey', 'lightblue')
names(mycol2) = levels(data_cases2$pango)

p = ggplot(data=data_cases2, aes(x=week, y=n, group=pango, fill=pango))
p = p + geom_area(linewidth=.2, alpha=.7)
p = p + scale_fill_manual('pango', values=mycol2)
p = p + scale_x_discrete('calendar week')
p = p + scale_y_continuous('')
p = p + facet_wrap(vars(var), nrow=3, scales='free_y', labeller = labeller(var=c('cases' = '# of positive cases', 'percSeq' = '% of cases sequenced', 'lineages'='# of sequenced cases')))
p = p + theme_minimal()  + theme(text = element_text(size=12), axis.text.x = element_text(angle=90, hjust=1)) + guides(fill="none")
print(p)

p = ggplot(data, aes(x=week, y=percentage, group=pango, fill=pango))
p = p + geom_area(linewidth=.2, alpha=.7)
p = p + scale_fill_manual('pango', values=mycol)
p = p + scale_x_discrete('calendar week')
p = p + scale_y_continuous('percentage' ,labels = scales::percent)
p = p + theme_minimal()  + theme(text = element_text(size=12), axis.text.x = element_text(angle=90, hjust=1))  + guides(fill="none")
print(p)

Create a simplified dataset by conflating VOC Pango sublineages to their WHO labels:

spsp_summary_week_redux = by(spsp_sub[,'pango_simpl'], list(spsp_sub$week), table)

df_sequed_week_redux = data.frame(Var1=character(), Freq=numeric(), week=character())
for (i in 1:length(spsp_summary_week_redux)) {
  dat = data.frame(spsp_summary_week_redux[[i]])
  dat$week = labels(spsp_summary_week_redux[i])
  df_sequed_week_redux = rbind(df_sequed_week_redux,dat)
}
colnames(df_sequed_week_redux)=c("pango","value","week")
df_sequed_week_redux = df_sequed_week_redux[df_sequed_week_redux$value > 0,]

df_sequed_week_redux$week = as.factor(df_sequed_week_redux$week)
df_sequed_week_redux$pango = as.factor(df_sequed_week_redux$pango)

data_redux = df_sequed_week_redux %>% group_by(week, pango, .drop=FALSE) %>% summarise(n=sum(value)) %>% mutate(percentage = n/sum(n))

mycol_redux = c(mycol, mycol['B.1.1.7'], mycol['B.1.617.2'], mycol['B.1.1.529'], mycol['BA.1'], mycol['BA.2'], mycol['BA.3'], mycol['BA.4'], mycol['BA.5'], mycol['P.1'], mycol["B.1.351"], mycol["XD"])
names(mycol_redux) = c(names(mycol), 'alpha', 'delta', 'omicron', 'omicron1', 'omicron2', 'omicron3', 'omicron4', 'omicron5', 'gamma', 'beta', 'omicron_rec')

# set colours for VOCs
mycol_voc = mycol_redux[vocs]

First detection of VOCs

Introduction week:

intro_all = unlist(lapply(vocs, function(this.voc){
  introweek = min(spsp_sub$week[str_detect(spsp_sub$pango_simpl, this.voc)])
  
  return(introweek)
  
}))
names(intro_all) = vocs
intro_all
##     alpha     delta   omicron 
## "2020-46" "2021-11" "2021-46"

Introduction date:

intro_all_date = lapply(vocs, function(this.voc){
  introdate = min(spsp_sub$date[str_detect(spsp_sub$pango_simpl, this.voc)])
  return(introdate)
})
intro_all_date = do.call('c', intro_all_date)
names(intro_all_date) = vocs
intro_all_date
##        alpha        delta      omicron 
## "2020-11-09" "2021-03-16" "2021-11-21"

Number of cases and percentage of sequenced cases at the time of introduction of each VOC:

df.atTimeOfIntro = data_cases2[data_cases2$week %in% intro_all & data_cases2$var %in% c('cases','percSeq'),]
df.atTimeOfIntro$voc = vocs

p = ggplot(df.atTimeOfIntro, aes(x=voc, y=n, fill=voc))
p = p + geom_bar(stat='identity', position=position_dodge(), alpha=0.9)
p = p + scale_fill_manual('voc', values=mycol_voc[c('alpha', 'delta', 'omicron')])
p = p + facet_wrap(vars(var), ncol=2, scales='free', labeller=labeller(var=c('cases'='# of positive cases', 'percSeq' = '% of cases sequenced')))
p = p + theme_minimal()+ theme(text = element_text(size=12), axis.text.x = element_text(angle=90, hjust=1)) + guides(fill="none")
p = p + xlab('') + ylab('') 
print(p)

Diversity of lineages

# number of lineages per week
# Shannon diversity index per week (vegan::diversity, H' = -sum(p_i * ln(p_i)))
# nucleotide diversity per week

weeks = unique(spsp_sub$week)

# all lineages
div_week = c() 
num_lin = c()
for (this.week in unique(data$week)){
  sub.data = data[data$week == this.week,]
  
  num_lin = c(num_lin, length(which(sub.data$n > 0)))
  div_week = c(div_week, diversity(sub.data$n))
  
}

df.diversity = data.frame(week=unique(data$week), diversity = div_week, num_lineages = num_lin)
df.diversity_long = gather(df.diversity, var, value, diversity:num_lineages)

# redux data set
div_week = c()
num_lin = c()
for (this.week in unique(data_redux$week)){
  sub.data = data_redux[data_redux$week == this.week,]
  count.omicron = length(which(str_detect(sub.data$pango[sub.data$n>0], 'omicron')))
  
  if(count.omicron > 1){ # if more than one omicron lineage
    notomicron = which(! str_detect(sub.data$pango, 'omicron'))
    sub.data_corr = sub.data[notomicron,]
    num.omicron = sub.data$n[str_detect(sub.data$pango, 'omicron')]
    sub.data_corr = bind_rows(sub.data_corr, data.frame(week=this.week, pango='omicron', n=sum(num.omicron), percentage=NA)) # percentage is irrelevant here

    num_lin = c(num_lin, length(which(sub.data_corr$n > 0)))
    div_week = c(div_week, diversity(sub.data_corr$n))
    
  }else{ # if only one omicron lineage or none
    num_lin = c(num_lin, length(which(sub.data$n > 0)))
    div_week = c(div_week, diversity(sub.data$n))
  }
  
  
}

df.diversity_redux = data.frame(week=unique(data_redux$week), diversity = div_week, num_lineages = num_lin)
df.diversity_redux_long = gather(df.diversity_redux, var, value, diversity:num_lineages)
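
With its default settings, vegan::diversity() returns the Shannon index H' = -sum(p_i * ln(p_i)). A minimal check on made-up counts (illustrative only):

counts = c(10, 5, 1)
p = counts / sum(counts)
c(manual = -sum(p * log(p)), vegan = diversity(counts)) # both ~0.83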

In order to calculate genetic diversity as measured by nucleotide diversity, we need all sequences per week, generate an alignment per week, and calculate the nucleotide diversity of each alignment. Handling the sequence data in R, including the nucleotide diversity calculation, requires a lot of RAM and is better performed on a machine with sufficient memory. The alignments are also computed outside of R on an HPC using MAFFT v7.505.
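
Nucleotide diversity (pi) is the mean number of pairwise differences per site over all pairs of sequences in an alignment. A minimal illustration of pegas::nuc.div() on the small woodmouse alignment shipped with the ape package (not SARS-CoV-2 data):

data(woodmouse) # example DNA alignment from ape
nuc.div(woodmouse, pairwise.deletion = TRUE)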

Code for preparation of the data and calculation of nucleotide diversity:

# nucleotide diversity
# download sequences from gisaid (chunks of 10k)


# fetch gisaid IDs and write them to files for download of sequence data from gisaid.org
# (batches of 10,000; the table has ~143k rows, so the last batch, starting at 140001,
# runs to the end)
index = lapply(seq(1, 150000, 10000), function(x){
  if (x == 140001){
    return(seq(x, nrow(spsp_sub)))
  }else{
    return(seq(x, x + 10000 - 1))
  }
})
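# (a generic alternative that does not hard-code the row count:
#  index = split(seq_len(nrow(spsp_sub)), ceiling(seq_len(nrow(spsp_sub)) / 10000)) )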

gisaid_ids = lapply(seq_along(index), function(idx){
  return(spsp_sub$gisaid[index[[idx]]])
})

# remove gaps where we don't have a gisaid ID
gisaid_ids_clean = lapply(gisaid_ids, function(this.batch){
  if(any(str_detect(this.batch, '-'))){
    new = this.batch[-str_which(this.batch, '-')]
    return(new)
  }else{
    return(this.batch)
  }
})

for(i in 1:length(gisaid_ids_clean)){
  write.table(gisaid_ids_clean[[i]], file=paste0('data/gisaid_id_', i, '.txt'), sep='\t', col.names = F, row.names = F, quote=F)
}
# --> download data from gisaid.org


# read in sequences
allseq.files = list.files('data/gisaid_download/')

# needs lots of RAM
l.allseq = list()
for (i in 1:length(allseq.files)){
  print(i)
  l.allseq[[i]] = read.fasta(paste0('data/gisaid_download/', allseq.files[i]), seqtype = 'DNA')
}
# concatenate all sequences into one list
allseq = unlist(l.allseq, recursive = F)
save(allseq, file='allseq.RData')
rm(l.allseq)

allgisaid = spsp_sub$gisaid
namesseq = names(allseq) %>% str_extract('EPI_ISL_[:digit:]+') # not all had sequences in gisaid

# a few sequences were not available for download from GISAID; drop them from the table
gisaid_clean = spsp_sub[allgisaid %in% namesseq,]
gisaid_week = gisaid_clean %>% group_by(week) %>% group_map(~.x)
names(gisaid_week) = unique(gisaid_clean$week)


# get sequences per week and write to file to generate a multiple sequence alignment (per week) 
for (i in 1:length(gisaid_week)){
  this.week = gisaid_week[[i]]

  seqindex = which(namesseq %in% this.week$gisaid)
  this.week.fasta = allseq[seqindex]

  write.fasta(this.week.fasta, names=names(this.week.fasta) , file.out=paste0('data/seqs_week_', names(gisaid_week[i]), '.fasta'))
}
# ---> make whole genome alignments with mafft 

# read in alignments
allfiles = list.files('aln/')

l.msa = list()
for (i in 1:length(allfiles)){
  l.msa[[i]] = read.dna(paste0('aln/', allfiles[i]), format = 'fasta')
}
names(l.msa) = weeks # note: assumes the files in aln/ list in the same order as weeks

library(parallel)
library(pbmcapply) # wrapper for mclapply with progress bar

detectCores()
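# pbmclapply() defaults to getOption('mc.cores', 2L) workers; pass mc.cores
# explicitly (up to detectCores()) to parallelise further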

l.nucdiv = pbmclapply(l.msa, nuc.div, pairwise.deletion=T)
names(l.nucdiv) = weeks
save(l.nucdiv, file='l.nucdiv.RData')

Read in data for nucleotide diversity:

load('data/l.nucdiv.RData')

df.nucdiv = do.call(rbind, l.nucdiv)
colnames(df.nucdiv) = 'nucdiv' # a 'variance' column would be added if nuc.div() were run with variance = TRUE
rownames(df.nucdiv) = NULL
df.nucdiv = bind_cols(week=as.factor(weeks), df.nucdiv)

Shannon diversity index

df.diversity_all = data.frame(week=df.diversity$week, nuc_div=df.nucdiv$nucdiv, shannon_div=df.diversity$diversity, shannon_div_redux=df.diversity_redux$diversity)
df.diversity_all_long = gather(df.diversity_all, var, value, nuc_div:shannon_div_redux)
mycol_div = c("#cdb5de", "#498975","#8cd6cf" )

p = ggplot(df.diversity_all_long, aes(x=week, y=value, group=var, fill=var))
p = p + geom_area(linewidth=.2, alpha=.7)
p = p + scale_x_discrete('calendar week')
p = p + scale_fill_manual('var', values=mycol_div)
p = p + facet_wrap(vars(var), nrow=3, scales='free_y',
                   labeller = labeller(var=c('nuc_div' = 'Nucleotide diversity', 'shannon_div' = 'Shannon diversity (All Pango lineages)', 'shannon_div_redux' = 'Shannon diversity (With VOC lineages)'))) 
p = p + theme_minimal()  + theme(text = element_text(size=12), axis.text.x = element_text(angle=90, hjust=1))  + guides(fill="none") + ylab('')
print(p)

Number of lineages

df.num_lineages = data.frame(week=df.diversity$week, num_all=df.diversity$num_lineages, num_redux=df.diversity_redux$num_lineages)
df.num_lineages_long = gather(df.num_lineages, var, value, num_all:num_redux)

mycol_num = c('#e0b33d', '#677d3f')
p = ggplot(df.num_lineages_long, aes(x=week, y=value, group=var, fill=var))
p = p + geom_area(linewidth=.2, alpha=.7)
p = p + scale_fill_manual('var', values=mycol_num)
p = p + scale_x_discrete('calendar week')
p = p + scale_y_continuous('Number of lineages')
p = p + facet_wrap(vars(var), nrow=2, scales='free_y',
                   labeller = labeller(var=c('num_all' = 'All pango lineages', 'num_redux' = 'With VOC lineages')))
p = p + theme_minimal()  + theme(text = element_text(size=12), axis.text.x = element_text(angle=90, hjust=1))  + guides(fill="none")
print(p)

Speed of introduction of VOCs

The speed of introduction is defined as the slope of a linear model fitted to the log-transformed daily VOC sequence counts during the growth period.
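
On the log scale, the fitted slope k acts as a daily growth rate: counts grow by a factor of exp(k) per day, and the implied doubling time is log(2)/k. A small worked example with an assumed slope:

k = 0.1      # assumed slope per day (illustrative value, not a fitted result)
exp(k)       # daily growth factor, ~1.11
log(2) / k   # doubling time, ~6.9 days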

# helper function to plot linear regression model with coefficients
# adapted from https://sejohnston.com/2012/08/09/a-quick-and-easy-function-to-plot-lm-results-in-r/
ggplotRegression <- function (fit, header) {
  
  require(ggplot2)
  
  # aes_string() is deprecated in current ggplot2; use the .data pronoun instead
  ggplot(fit$model, aes(x = .data[[names(fit$model)[2]]], y = .data[[names(fit$model)[1]]])) + 
    geom_point() +
    stat_smooth(method = "lm", col = "red") +
    labs(title = paste(header, "Adj R2 = ", signif(summary(fit)$adj.r.squared, 5),
                       "Intercept =", signif(fit$coef[[1]], 5),
                       " Slope =", signif(fit$coef[[2]], 5),
                       " P =", signif(summary(fit)$coef[2,4], 5))) +
    theme_minimal() + theme(plot.title = element_text(size=8))
}
# daily sequence counts by lineage
spsp_summary_date_redux = by(spsp_sub[,'pango_simpl'], list(spsp_sub$date), table)

df_sequed_date_redux = data.frame(Var1=character(), Freq=numeric(), date=Date()) # Date() from lubridate creates an empty Date vector
for (i in 1:length(spsp_summary_date_redux)) {
  dat = data.frame(spsp_summary_date_redux[[i]])
  dat$date = as.Date(labels(spsp_summary_date_redux[i]))
  df_sequed_date_redux = rbind(df_sequed_date_redux,dat)
}
colnames(df_sequed_date_redux)=c("pango","value","date")
df_sequed_date_redux = df_sequed_date_redux[df_sequed_date_redux$value > 0,]

df_sequed_date_redux$pango = as.factor(df_sequed_date_redux$pango)

# extract number of VOC sequences; conflate all omicron sublineages into one
l.vocs_seq = lapply(vocs, function(this.voc){
  tmp = df_sequed_date_redux[str_detect(df_sequed_date_redux$pango, this.voc),]
  
  # summarise all omicron types into one 
  if(this.voc == 'omicron'){
    tmp = tmp %>% group_by(date) %>% summarise(value=sum(value)) 
  }
  
  return(tmp)
})
names(l.vocs_seq) = vocs

Set boundary for growth period:

# set date for each VOC that limits the growth period 
t2 = c(alpha=as.Date('2021-03-01'), delta=as.Date('2021-08-01'), omicron=as.Date('2022-01-01'))

# filter data only for period of introduction/exponential growth 
l.vocs_seq_growth = lapply(vocs, function(this.voc){
  df.voc = l.vocs_seq[[this.voc]] %>% filter(date <= t2[this.voc])
  df.voc$log_value = log(df.voc$value)
  return(df.voc)
})
names(l.vocs_seq_growth) = vocs

# plotting
for(this.voc in vocs){
  this = l.vocs_seq[[this.voc]]
  
  p = ggplot(this, aes(x=date, y=value))
  p = p + geom_point()
  p = p + geom_vline(xintercept = t2[this.voc], color='blue')
  p = p + theme_minimal() + ggtitle(this.voc) 
  print(p)
}

Calculate linear model for each VOC:

l.vocs_speed = lapply(l.vocs_seq_growth, function(df){
  lm(log_value ~ date, data=df)
})
names(l.vocs_speed) = vocs

# plotting
for (this.voc in vocs){
  fit = l.vocs_speed[[this.voc]]
  print(ggplotRegression(fit, this.voc))
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

Percentage growth in prevalence

# first calculate per day the percentage of each lineage - starting from df_sequed_date_redux
data_redux_dates = df_sequed_date_redux %>% group_by(date, pango, .drop=FALSE) %>% summarise(n=sum(value)) %>% mutate(percentage = n/sum(n))

# get only VOC data including the percentage, sum up all omicron sub variants into one
l.perc_vocs = lapply(vocs, function(this.voc){
  tmp = data_redux_dates[str_detect(data_redux_dates$pango, this.voc),]
  tmp=tmp[tmp$n>0,]
  
  # summarise all omicron types into one 
  if(this.voc == 'omicron'){
    tmp = tmp %>% group_by(date) %>% summarise(pango='omicron',n=sum(n), percentage=sum(percentage))
  }
  
  return(tmp)
})
names(l.perc_vocs) = vocs


# MIN PREVALENCE
min_prevalence = 0.5 # prevalence threshold (50%); adjust here if needed


# threshold date for linear regression
minPrevDate = Date()
for (this.voc in vocs){
  tmp = l.perc_vocs[[this.voc]]
  minPrevDate = c(minPrevDate, min(tmp$date[tmp$percentage >= min_prevalence]))
}
names(minPrevDate) = vocs
rm(this.voc)
rm(tmp)

l.perc_vocs_growth = lapply(vocs, function(this.voc){
  df.voc = l.perc_vocs[[this.voc]] %>% filter(date <= minPrevDate[this.voc])
  df.voc$log_value = log(df.voc$percentage * 100)
  return(df.voc)
})
names(l.perc_vocs_growth) = vocs

Number of sequences for each VOC and date when a prevalence of 50% is reached (blue line):

for(this.voc in vocs){
  this = l.perc_vocs[[this.voc]]
  
  p = ggplot(this, aes(x=date, y=n))
  p = p + geom_point()
  p = p + geom_vline(xintercept = minPrevDate[this.voc], color='blue')
  p = p + theme_minimal() + ggtitle(this.voc) 
  print(p)
}

Downsampling of dataset

Downsampling the complete dataset of roughly 143,000 sequences to eight sizes ranging from 2,500 to 100,000 sequences, with 100 replicates per size. This chunk is very slow and should be run on an HPC. Results are saved to TSV files and read back in further below.

x_range = c(2500, 5000, 7500, 10000, 25000, 50000, 75000, 100000)
n = 100

outfolder='data/spsp_data/downsampling/' # set outfolder name where downsampling results are being saved

Code for downsampling from scratch:

## DOWNSAMPLING FROM SCRATCH  

# helper function 
subsample_data = function(spsp_sub, x_range, n){
  vocs = c('alpha', 'delta', 'omicron')

  alldates=lapply(x_range, function(x){
    df.dates = data.frame()
    for (i in 1:n){
      subsample_index = sample(c(1:nrow(spsp_sub)), x, replace = F)
      spsp_unif = spsp_sub[subsample_index,]

      # create subfolder and save samples files
      # dir.create(file.path(outfolder, x), showWarnings = F)
      # write.table(spsp_unif, file=paste0(outfolder, '/', x, '/spsp_', x/1000, 'k_', i, '.tsv'), col.names = T, row.names = F, quote=F, sep='\t')

      introdates_all = lapply(vocs, function(this.voc){
        introdate = min(spsp_unif$date[str_detect(spsp_unif$pango_simpl, this.voc)])
        return(introdate)
      })
      introdates_all = do.call('c', introdates_all)
      names(introdates_all) = vocs
      introdates_all = t(as.data.frame(introdates_all))
      df.dates = rbind(df.dates, introdates_all)
    }
    df.dates=cbind(n=c(1:n), df.dates)
    rownames(df.dates)=NULL

    return(df.dates)
  })
  names(alldates)=x_range

  df.alldates = bind_rows(alldates, .id='x')
  df.alldates$x = as.numeric(df.alldates$x)
  df.alldates$alpha = as.Date(df.alldates$alpha)
  df.alldates$delta = as.Date(df.alldates$delta)
  df.alldates$omicron = as.Date(df.alldates$omicron)
  return(df.alldates)
}


# downsampling
df.alldates = subsample_data(spsp_sub, x_range, n)

# get intro dates
summary_alldates = lapply(x_range, function(x){
  this = df.alldates[df.alldates$x == x,]
  median.alpha = median(this$alpha)
  median.delta = median(this$delta)
  median.omicron = median(this$omicron)

  min.alpha = min(this$alpha)
  min.delta = min(this$delta)
  min.omicron = min(this$omicron)

  max.alpha = max(this$alpha)
  max.delta = max(this$delta)
  max.omicron = max(this$omicron)

  return(data.frame(median.alpha, min.alpha, max.alpha,
                    median.delta, min.delta, max.delta,
                    median.omicron, min.omicron, max.omicron))

})
names(summary_alldates) = x_range
df.summary_alldates = bind_rows(summary_alldates, .id='x')


sd_alldates = aggregate(. ~ x, df.alldates[,c('x','alpha', 'delta', 'omicron')], function(x) c(sd = sd(x)))
colnames(sd_alldates) = c('x', 'sd.alpha', 'sd.delta', 'sd.omicron')
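# equivalent dplyr sketch:
# sd_alldates = df.alldates %>% group_by(x) %>%
#   summarise(sd.alpha = sd(alpha), sd.delta = sd(delta), sd.omicron = sd(omicron))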

df.summary_alldates = bind_cols(df.summary_alldates, sd_alldates[,2:4])

# write data to files
write.table(df.summary_alldates, file=paste0(outfolder, '/introduction_vocs_subsampling_', n, 'x_summary.txt'), col.names = T, row.names = F, quote=F, sep='\t')
write.table(df.alldates, file=paste0(outfolder, '/introduction_vocs_subsampling_', n, 'x_dates.txt'), col.names = T, row.names = F, quote=F, sep='\t')

Read in downsampled datasets:

outfolder.dirs = paste0(outfolder, '/',x_range)
l.spsp_subsample = lapply(seq_along(x_range), function(k){
  x = x_range[k]
  this.dir = outfolder.dirs[k]
  print(this.dir)
  # read in 100 spsp tables of x
  list100 = lapply(1:n, function(i){
    read.table(paste0(this.dir, '/spsp_', x/1000, 'k_', i, '.tsv'), header=T, sep='\t')
  })
  
  return(list100)
})
## [1] "data/spsp_data/downsampling//2500"
## [1] "data/spsp_data/downsampling//5000"
## [1] "data/spsp_data/downsampling//7500"
## [1] "data/spsp_data/downsampling//10000"
## [1] "data/spsp_data/downsampling//25000"
## [1] "data/spsp_data/downsampling//50000"
## [1] "data/spsp_data/downsampling//75000"
## [1] "data/spsp_data/downsampling//1e+05"
names(l.spsp_subsample) = x_range

df.alldates = read.table(paste0(outfolder, 'introduction_vocs_subsampling_100x_dates.txt'), header=T, sep='\t')

# correct the wrong week formats for the first days of the year
for (i in 1:length(l.spsp_subsample)){
  for (k in 1:n){
    l.spsp_subsample[[i]][[k]]$week[l.spsp_subsample[[i]][[k]]$week == '2021-53'] = '2020-53'
    l.spsp_subsample[[i]][[k]]$week[l.spsp_subsample[[i]][[k]]$week == '2022-52'] = '2021-52'
  }
}

First detection of VOCs when downsampling

df.alldates_long = gather(df.alldates, variant, date, alpha:omicron)
df.alldates_long$x = as.factor(df.alldates_long$x)
df.alldates_long$date = as.Date(df.alldates_long$date)
df.alldates_long = rbind(df.alldates_long, data.frame(x=rep('all',3), n=rep(1,3), variant=vocs, date=intro_all_date))

# instead of absolute dates, use the delay (delta t) relative to the full dataset
df.alldates_long$delay[df.alldates_long$variant == 'alpha'] = as.Date(intro_all_date['alpha']) - df.alldates_long$date[df.alldates_long$variant == 'alpha']
df.alldates_long$delay[df.alldates_long$variant == 'delta'] = as.Date(intro_all_date['delta']) - df.alldates_long$date[df.alldates_long$variant == 'delta']
df.alldates_long$delay[df.alldates_long$variant == 'omicron'] = as.Date(intro_all_date['omicron']) - df.alldates_long$date[df.alldates_long$variant == 'omicron']

df.alldates_long2 = df.alldates_long
df.alldates_long2$x = as.character(df.alldates_long2$x)
df.alldates_long2$x[df.alldates_long2$x == '1e+05'] = '100000'
df.alldates_long2$x = factor(df.alldates_long2$x, levels=unique(df.alldates_long2$x))

p = ggplot(df.alldates_long2, aes(x=x, y=abs(delay), fill=variant, alpha=x))
p = p + geom_boxplot()
p = p + facet_wrap(vars(variant), nrow=3, scales = 'fixed')
p = p + scale_alpha_discrete(range=c(0.1,1))
p = p + scale_fill_manual('variant', values = mycol_voc) + guides(fill='none', alpha='none')
p = p + ylab('Delay [days]') + xlab('Downsampling size')
p = p + theme_minimal() + coord_flip()
print(p)

Diversity when downsampling

l.diversity = lapply(l.spsp_subsample, function(this.x){
  
  l.div_x = lapply(this.x, function(this.spsp){
    this.summary_week = by(this.spsp[,'pango'], list(this.spsp$week), table)
    
    this.df_sequed_week= data.frame(Var1=character(), Freq=numeric(), week=character())
    for (i in 1:length(this.summary_week)) {
      dat = data.frame(this.summary_week[[i]])
      dat$week = labels(this.summary_week[i])
      this.df_sequed_week = rbind(this.df_sequed_week,dat)
    }
    colnames(this.df_sequed_week)=c("pango","value","week")
    this.df_sequed_week = this.df_sequed_week[this.df_sequed_week$value > 0,]
    
    this.df_sequed_week$week = as.factor(this.df_sequed_week$week)
    this.df_sequed_week$pango = as.factor(this.df_sequed_week$pango)
    
    this.data = this.df_sequed_week %>% group_by(week, pango, .drop=FALSE) %>% summarise(n=sum(value)) %>% mutate(percentage = n/sum(n))
    
    
    div_week = c()
    num_lin = c()
    for (this.week in unique(this.data$week)){
      sub.data = this.data[this.data$week == this.week,]
      
      num_lin = c(num_lin, length(which(sub.data$n > 0)))
      div_week = c(div_week, diversity(sub.data$n))
      
    }
    
    this.df.diversity = data.frame(week=unique(this.data$week), diversity = div_week, num_lineages = num_lin)
    
    return(this.df.diversity)
  })
  
  
  return(l.div_x)
  
})


l.diversity_df = lapply(l.diversity, function(this.x){
  bind_rows(this.x, .id='n')
})
df.diversity_subsample = bind_rows(l.diversity_df, .id='x')

df.diversity_subsample$week = factor(df.diversity_subsample$week, levels=c(df.diversity$week))
df.diversity_subsample$x = factor(df.diversity_subsample$x, levels=rev(unique(df.diversity_subsample$x)))

Shannon diversity index

p = ggplot(df.diversity_subsample, aes(x=week, y=diversity))
p = p + geom_area(data=df.diversity_all_long[df.diversity_all_long$var=='shannon_div',], aes(x=week, y=value, group=var, fill=var), alpha=0.1)
p = p + geom_boxplot()
p = p + scale_x_discrete('calendar week')
p = p + scale_y_continuous('Shannon diversity')
p = p + facet_wrap(vars(x), nrow=8)
p = p + theme_minimal() + theme(text = element_text(size=12), axis.text.x = element_text(angle=90, hjust=1))  + guides(fill="none", alpha='none')
print(p)

Number of lineages

p = ggplot(df.diversity_subsample, aes(x=week, y=num_lineages))
p = p + geom_area(data=df.num_lineages_long[df.num_lineages_long$var=='num_all',], aes(x=week, y=value, group=var, fill=var), alpha=0.1)
p = p + geom_boxplot()
p = p + scale_fill_manual('var', values=mycol_num)
p = p + scale_x_discrete('calendar week')
p = p + scale_y_continuous('Number of lineages')
p = p + facet_wrap(vars(x), nrow=8)
p = p + theme_minimal() + theme(text = element_text(size=12), axis.text.x = element_text(angle=90, hjust=1))  + guides(fill="none", alpha='none')
print(p)

Speed of introduction when downsampling

Calculate the speed of introduction for each downsampled dataset (slow; run on an HPC and save as an R object that can be read back in):

l.speed_subsample = lapply(l.spsp_subsample, function(this.x){

  # transform each subsampled data frame and calculate linear model
  l.df_seq = lapply(this.x, function(this.spsp){
    this.spsp_summary = by(this.spsp[, 'pango_simpl'], list(this.spsp$date), table)

    df.seq = data.frame(Var1 = character(), Freq=numeric(), date=Date())
    for (i in 1:length(this.spsp_summary)){
      dat = data.frame(this.spsp_summary[[i]])
      dat$date = as.Date(labels(this.spsp_summary[i]))
      df.seq = rbind(df.seq, dat)
    }
    colnames(df.seq) = c('pango', 'value', 'date')
    df.seq = df.seq[df.seq$value > 0,]
    df.seq$pango = as.factor(df.seq$pango)

    return(df.seq)
  })

  l.vocs_all = lapply(l.df_seq, function(df.seq){
    l.vocs = lapply(vocs, function(this.voc){
      tmp = df.seq[str_detect(df.seq$pango, this.voc),]

      # summarise all omicron types into one
      if (this.voc == 'omicron'){
        tmp = tmp %>% group_by(date) %>% summarise(value=sum(value))
      }
      return(tmp)
    })
    names(l.vocs) = vocs
    return(l.vocs)
  })

  # filter only growth period
  l.vocs_all_growth = lapply(l.vocs_all, function(this.spsp){
    l.vocs_growth = lapply(vocs, function(this.voc){
      df.voc = this.spsp[[this.voc]] %>% filter(date <= t2[this.voc])
      df.voc$log_value = log(df.voc$value)
      return(df.voc)
    })
    names(l.vocs_growth) = vocs
    return(l.vocs_growth)
  })


  #calculate linear model for all datasets and vocs
  l.vocs_all_speed = lapply(l.vocs_all_growth, function(this.spsp){
    l.speed = lapply(vocs, function(this.voc){
      lm(log_value ~ date, data=this.spsp[[this.voc]])
    })
    names(l.speed) = vocs
    return(l.speed)
  })

  return(l.vocs_all_speed)

})
names(l.speed_subsample) = x_range

save(l.speed_subsample, file='data/speed_subsample.RData')

Load the saved R object from the previous calculation:

load('data/speed_subsample.RData')

# extract slope from all linear models
l.slopes_subsample = lapply(l.speed_subsample, function(this.x){
  l.slopes_this = lapply(this.x, function(this.spsp){
    data.frame(var = names(this.spsp), slope = c(this.spsp[[1]]$coefficients[2],
                                                 this.spsp[[2]]$coefficients[2],
                                                 this.spsp[[3]]$coefficients[2]))
    
  })
  
  df.slopes = bind_rows(l.slopes_this)
  df.slopes = cbind(n=rep(1:n, each=3), df.slopes)
})
names(l.slopes_subsample) = x_range


df.slopes_subsample = bind_rows(l.slopes_subsample, .id = "x")
df.slopes_all = bind_rows(lapply(l.vocs_speed, function(this){data.frame(slope = unname(this$coefficients[2]))}), .id='var')
df.slopes_all = bind_cols(x=rep('all', 3), n=rep(1, 3), df.slopes_all)
colnames(df.slopes_all) = c('x', 'n', 'var', 'slope')


df.slopes = bind_rows(df.slopes_subsample, df.slopes_all)
df.slopes$x = factor(df.slopes$x, levels=c('2500', '5000', '7500', '10000', '25000', '50000', '75000', '1e+05', 'all'))

df.slopes_subsample$x = factor(df.slopes_subsample$x, levels=c('2500', '5000', '7500', '10000', '25000', '50000', '75000', '1e+05'))
p = ggplot(df.slopes_subsample, aes(x=x, y=slope, fill=var, alpha=x))
p = p + geom_violin(alpha=0.5) + geom_point(alpha=0.5, position=position_jitter(width=0.1), size=0.2)
p = p + geom_hline(data=df.slopes_all, aes(yintercept = slope), color='red')
p = p + facet_wrap(vars(var), ncol=3)
p = p + scale_fill_manual('var', values=mycol_voc)
p = p + scale_alpha_discrete(range=c(0.1,1))
p = p + theme_minimal(base_size = 15) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p = p + xlab('') + ylab('Speed of introduction [slope]') + guides(fill="none", colour='none') + theme(panel.border = element_rect(colour = "black", fill=NA)) 
print(p)

Time to reach 50% prevalence

l.perc_vocs_all = lapply(l.spsp_subsample, function(this.x){
  
  # transform each subsampled data frame
  l.df_seq = lapply(this.x, function(this.spsp){
    this.spsp_summary = by(this.spsp[, 'pango_simpl'], list(this.spsp$date), table)
    
    df.seq = data.frame(Var1 = character(), Freq=numeric(), date=Date())
    for (i in 1:length(this.spsp_summary)){
      dat = data.frame(this.spsp_summary[[i]])
      dat$date = as.Date(labels(this.spsp_summary[i]))
      df.seq = rbind(df.seq, dat)
    }
    colnames(df.seq) = c('pango', 'value', 'date')
    df.seq = df.seq[df.seq$value > 0,]
    df.seq$pango = as.factor(df.seq$pango)
    # calculate percentage
    df.seq_perc = df.seq %>% group_by(date, pango, .drop=FALSE) %>% summarise(n=sum(value)) %>% mutate(percentage = n/sum(n))
    # df.seq_perc = df.seq_perc[df.seq_perc$n > 0,]
    return(df.seq_perc)
  })
  
  # get only vocs and unify omicron
  l.perc_vocs_all = lapply(l.df_seq, function(df.seq){
    l.vocs = lapply(vocs, function(this.voc){
      tmp = df.seq[str_detect(df.seq$pango, this.voc),]
      tmp = tmp[tmp$n>0,]
      # summarise all omicron types into one 
      if (this.voc == 'omicron'){
        tmp = tmp %>% group_by(date) %>% summarise(pango='omicron',n=sum(n), percentage=sum(percentage))
      }
      return(tmp)
    })
    names(l.vocs) = vocs
    return(l.vocs)
  })

  
})
names(l.perc_vocs_all) = x_range


# time to reach min_prevalence = minPrevDate
l.minPrevDate_subsample = lapply(l.perc_vocs_all, function(this.x){
  l.minPrevDate_x = lapply(this.x, function(this.spsp){
    
    data.frame(var=names(this.spsp), minPrevDate=c(min(this.spsp[[1]]$date[this.spsp[[1]]$percentage >= min_prevalence]),
                                                   min(this.spsp[[2]]$date[this.spsp[[2]]$percentage >= min_prevalence]),
                                                   min(this.spsp[[3]]$date[this.spsp[[3]]$percentage >= min_prevalence])))
  })
  df.minPrevDate = bind_rows(l.minPrevDate_x)
  df.minPrevDate = cbind(n=rep(1:n, each=3), df.minPrevDate)
  
})
names(l.minPrevDate_subsample) = x_range

df.minPrevDate_subsample = bind_rows(l.minPrevDate_subsample, .id='x')
df.minPrevDate_subsample$x[df.minPrevDate_subsample$x == '1e+05'] = '100000'

# drop the three rows for the complete dataset ('all') appended earlier
df.alldates_long3 = df.alldates_long2[df.alldates_long2$x != 'all',]
df.minPrevDate_subsample_sort = df.minPrevDate_subsample[order(df.minPrevDate_subsample$var),]

# cbind relies on both tables having the same row order (variant, then x, then n)
df.introDelay_Prev = cbind(df.alldates_long3, minPrevDate=df.minPrevDate_subsample_sort$minPrevDate)
df.introDelay_Prev$timeToPrev = df.introDelay_Prev$minPrevDate - df.introDelay_Prev$date

# include values for complete dataset
df.introDelay_Prev_all = df.alldates_long[df.alldates_long$x=='all',]
df.introDelay_Prev_all$minPrevDate = minPrevDate
df.introDelay_Prev_all$timeToPrev = df.introDelay_Prev_all$minPrevDate - df.introDelay_Prev_all$date

df.introDelay_Prev_concat = rbind(df.introDelay_Prev, df.introDelay_Prev_all)
p = ggplot(df.introDelay_Prev_concat, aes(x=x, y=as.numeric(timeToPrev), fill=variant, alpha=x))
p = p + geom_boxplot()
p = p + facet_wrap(vars(variant), ncol=3)
p = p + scale_alpha_discrete(range=c(0.1,1))
p = p + scale_fill_manual('variant', values = mycol_voc) + guides(fill='none', alpha='none')
p = p + ylab(paste0('Time to ', min_prevalence*100, '% prevalence [days]')) + xlab('Downsampling size')
p = p + theme_minimal(base_size = 15) + theme(axis.text.x = element_text(angle=90, hjust=1)) + theme(panel.border = element_rect(colour = "black", fill=NA)) 
print(p)

Correlation between first detection (introduction) date and speed of introduction

df.introdates_tmp = df.alldates_long[df.alldates_long$x != 'all',]
df.introdates_tmp$x = factor(df.introdates_tmp$x, levels=c(as.character(x_range)))
df.speed_tmp = df.slopes_subsample

df.speed_tmp$newvar = paste0(df.speed_tmp$x, df.speed_tmp$n, df.speed_tmp$var)
df.introdates_tmp$newvar = paste0(df.introdates_tmp$x, df.introdates_tmp$n, df.introdates_tmp$variant)

df.speed_intro = df.speed_tmp[,1:5]
df.speed_intro$date = df.introdates_tmp$date[match(df.speed_tmp$newvar, df.introdates_tmp$newvar)]
df.speed_intro$delay[df.speed_intro$var == 'alpha'] = intro_all_date['alpha']-df.speed_intro$date[df.speed_intro$var == 'alpha']
df.speed_intro$delay[df.speed_intro$var == 'delta'] = intro_all_date['delta']-df.speed_intro$date[df.speed_intro$var == 'delta']
df.speed_intro$delay[df.speed_intro$var == 'omicron'] = intro_all_date['omicron']-df.speed_intro$date[df.speed_intro$var == 'omicron']

# plot delay vs speed of introduction
p = ggplot(df.speed_intro, aes(x=abs(delay), y=slope, colour=var))
p = p + geom_point(alpha=0.7)
p = p + facet_wrap(vars(var, x), nrow=3)
p = p + scale_colour_manual('var', values=mycol_voc)
p = p + xlab('Delay [days]') + ylab('Speed of introduction [slope]') + guides(alpha="none", colour='none')
p = p + theme_minimal() + theme(panel.border = element_rect(colour = "black", fill=NA))
print(p)

Surveillance sequencing in Switzerland in comparison to other countries

International surveillance data was downloaded from gisaid.org.

meta = fread('data/metadata_tsv_2023_05_08/metadata.tsv') # download GISAID DATA from gisaid.org
pop = fread('data/RPOP_10112022144750906.csv') # population data 

# count sequences by country
country = meta$Location %>% str_extract('(?=\\/ ).+') %>% str_replace('^\\/ ', '') %>% str_extract('^[[:alpha:]]+([[:space:]][[:alpha:]]+)*')
country[is.na(country)] = 'USA' # unparsed Location strings (no '/ ' separator) are assigned to the USA
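# illustration of the parsing above on an assumed Location string:
# 'Europe / Switzerland / Basel'
#   str_extract(., '(?=\\/ ).+')                              -> '/ Switzerland / Basel'
#   str_replace(., '^\\/ ', '')                               -> 'Switzerland / Basel'
#   str_extract(., '^[[:alpha:]]+([[:space:]][[:alpha:]]+)*') -> 'Switzerland'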
tbl.country = as.data.frame(table(country))
tbl.country = tbl.country[order(-tbl.country$Freq),]
tbl.country$country = factor(tbl.country$country, levels=tbl.country$country)

# rank of Switzerland by absolute number of sequences
rank_CH = which(tbl.country$country =='Switzerland')  

# normalise sequencing effort by population; keep only countries with > 50,000 sequences
tbl.country_sub = tbl.country[tbl.country$Freq > 50000,]

# align country names in the population table with the spelling used in GISAID
pop$Country[pop$Country == 'Türkiye'] = 'Turkey'
pop$Country[pop$Country == 'Korea'] = 'South Korea'
pop$Country[pop$Country == 'United States'] = 'USA'

tbl.country_sub$freq.norm = tbl.country_sub$Freq/pop$Value[match(tbl.country_sub$country, pop$Country)]
tbl.country_sub = tbl.country_sub[order(-tbl.country_sub$freq.norm),]

# rank of Switzerland when normalised by population
rank_CH_norm = which(tbl.country_sub$country =='Switzerland')

Rank of Switzerland regarding surveillance sequencing, normalised by population:

rank_CH_norm
## [1] 8
p = ggplot(tbl.country_sub, aes(x=reorder(country, -freq.norm), y=freq.norm))
p = p + geom_bar(stat='identity')
p = p + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(p)