Load Packages

library("phyloseq")
library("ggplot2")
library(plyr)
library("dplyr")
## Warning: package 'dplyr' was built under R version 3.5.1
library("tidyr")
library("RColorBrewer", lib.loc="/Library/Frameworks/R.framework/Versions/3.4/Resources/library")
library(gridExtra)
library(reshape2)
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
library("vegan")

1 Initial Relative Abundance BarPlot

load data (exported from qiime2 taxa-bar-plots.qzv)

seqper<- read.csv("level-3.csv")

Filter to only contain Acanatharea samples and then remove taxa that were only present in the environmental samples.

seqper <- seqper[seqper$Organism == "Acantharea",]
row.names(seqper) <- seqper$meta
seqperRA <-seqper[,-c(1,30:35)] 
seqperRA <-  seqperRA[,colSums(seqperRA) > 0]

Transform data to relative abundance for plotting.

tseqperRA <- data.frame(t(seqperRA))
sumcols<- colSums(tseqperRA)
for(i in 1:ncol(tseqperRA)) {
  tseqperRA[[i]]<- (tseqperRA[[i]]/sumcols[[i]])*100
}
tseqperRA<-round(tseqperRA, digits = 2)
#write.csv(tseqperRA, file="initrelabund.csv", quote = FALSE, row.names= TRUE )

prepare data for plotting

tseqperRA$row <-row.names(tseqperRA)
mSingles<- melt(tseqperRA, id.vars="row")

plot:

S2 <- brewer.pal(8, "Set2")
colors1 <- c("#50a33e", "#7eb38e", "#b1f41b", "#0e8121", "#5edbff", "#0097c3", "#0a3863", "#56B4E9", "#b2f5ff", "#0e5c63", "#1b8085", "#3aa6a9", "#ffb3b3", "#ff7b7b")
colors<- c( S2, colors1)

# put samples in more logical order on x axis and change label names
xorder = c("st2.A.1","st2.A.3","st2.A.5","st2.A.6","st2.A.7","st4.A.9","st10.A.10","st12.A.11","st12.A.12","st12.A.14","st12.A.15","st12.A.16","st13.A.18","st13.A.19","st13.A.20","st13.A.21","st17.A.22","st17.A.25","st17.A.27","st17.A.28", "st17.A.29","st17.A.31","st17.A.33","st17.A.34","st17.A.35","st17.A.36","st17.A.39","st17.A.40","st17.A.42","st17.A.43", "Onna.April.A3", "Onna.April.A4","Onna.May.A1","Onna.May.A3","Onna.May.A6","Onna.May.A7","Onna.May.A10","Onna.May.A11","Onna.May.A12", "Catalina.A.Ae","Catalina.A.C1","Catalina.A.C2" )
xnames = c("st2.1","st2.3","st2.5","st2.6","st2.7","st4.9","st10.10","st12.11","st12.12","st12.14","st12.15","st12.16","st13.18","st13.19","st13.20","st13.21","st17.22","st17.25","st17.27","st17.28", "st17.29","st17.31","st17.33","st17.34","st17.35","st17.36","st17.39","st17.40","st17.42","st17.43", "Oki.3A", "Oki.4A","Oki.1","Oki.3","Oki.6","Oki.7","Oki.10","Oki.11","Oki.12", "Cat.Ae","Cat.C1","Cat.C2")

relabund<- ggplot(mSingles, aes(x=variable, y=value, fill=row)) + 
  geom_bar(stat="identity") +
  xlab("") +
  ylab("") +
  theme_bw()+
  scale_x_discrete(limits = xorder, labels = xnames)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  theme(legend.title=element_blank()) +
  theme(legend.position="bottom") +
  guides(fill=guide_legend(ncol=2))+
  scale_y_continuous(expand = c(0, 0)) + scale_fill_manual(values=colors)
relabund + theme(legend.position="none") + 
  theme(text=element_text(size=16,  family="serif"))

2 Intra-host Symbiont Diversity

2.1 Load Data for Phyloseq

Load data and prepare dataframes to be imported as phyloseq objects.

SV Feature Table

singles <- read.delim("singles-feature-table-renamed.txt")

row.names(singles)<-singles[[1]]
singles<-singles[,-(1)]
singles<- singles[ , order(names(singles))]

#convert feature table to OTU phyloseq object 
fmat <- as.matrix(singles)
OTU = otu_table(fmat, taxa_are_rows = TRUE)

Taxonomy Table

taxonomy <- read.csv("taxonomy.csv", stringsAsFactors = FALSE)
names(taxonomy) <- c("row", "tax", "Confidence")
row.names(taxonomy) <-taxonomy[[1]]
taxonomy <- taxonomy[,(-1)]

taxonomy <-  separate(taxonomy, tax, c("D0","D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "D10", "D11", "D12", "D13", "D14"), sep = ";", fill = "right")
taxonomy <- taxonomy[,c(1:5)]
taxmat <- as.matrix(taxonomy)
TAX = tax_table(taxmat)

Sample Information

metatable <- read.csv("sampledata.csv")
row.names(metatable) <- metatable[[1]]
metatable$HostSV <- as.character(metatable$HostSV)
metatable<- metatable[,(-1)]

META<- sample_data(metatable)

Phylogenetic Tree

tree<- read_tree("tree.nwk")

Make Phyloseq Object

ps<- phyloseq(OTU, TAX, META, tree)

2.2 Preprocessing

2.2.1 Prevavence Filtering

prevdf = apply(X = otu_table(ps),
               MARGIN = ifelse(taxa_are_rows(ps), yes = 1, no = 2),
               FUN = function(x){sum(x > 0)})

prevdf = data.frame(Prevalence = prevdf,
                    TotalAbundance = taxa_sums(ps),
                    tax_table(ps))

plyr::ddply(prevdf, "D2", function(df1){cbind(mean(df1$Prevalence),sum(df1$Prevalence))})
##                           D2          1   2
## 1             Ambiguous_taxa 0.02564103   1
## 2             D_2__Alveolata 0.01602925  57
## 3        D_2__Ancyromonadida 4.00000000   8
## 4        D_2__Apusomonadidae 1.87500000  15
## 5        D_2__Chloroplastida 0.26582278  21
## 6       D_2__Cryptomonadales 0.22222222   2
## 7               D_2__Discoba 0.00000000   0
## 8              D_2__Discosea 1.00000000   2
## 9               D_2__Holozoa 0.21904762  46
## 10     D_2__Kathablepharidae 0.00000000   0
## 11          D_2__Nucletmycea 1.00000000   3
## 12                D_2__P1-31 0.00000000   0
## 13         D_2__Palpitomonas 0.06896552   2
## 14                 D_2__Pav3 0.00000000   0
## 15       D_2__Pavlovophyceae 0.00000000   0
## 16     D_2__Prymnesiophyceae 1.15730337 412
## 17             D_2__Rhizaria 0.84988453 368
## 18         D_2__Rhodophyceae 1.00000000   1
## 19        D_2__Stramenopiles 0.11607143  26
## 20             D_2__Telonema 0.08064516   5
## 21 D_2__uncultured eukaryote 0.00000000   0
## 22                      <NA> 0.10391198  85
prevplot1<-ggplot(prevdf, aes(TotalAbundance, Prevalence / nsamples(ps),color=D2)) +
  geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) +  geom_point(size = 2, alpha = 0.7) + 
  theme_bw()+
  scale_x_log10() +  xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
  facet_wrap(~D2) + theme(legend.position="none")

Initial Prevalence Plot

## Warning: Transformation introduced infinite values in continuous x-axis

Filter at 5% prevalence (SVs below the dashed lines in the plots will be discarded) )

prevalenceThreshold = 0.05 * nsamples(ps)

keepTaxa = rownames(prevdf)[(prevdf$Prevalence >= prevalenceThreshold)]
ps2 = prune_taxa(keepTaxa, ps)

table(tax_table(ps2)[, "D2"], exclude = NULL)
## 
##   D_2__Ancyromonadida   D_2__Apusomonadidae   D_2__Chloroplastida 
##                     1                     2                     1 
##          D_2__Holozoa D_2__Prymnesiophyceae         D_2__Rhizaria 
##                     3                    21                    26 
##    D_2__Stramenopiles                  <NA> 
##                     2                     5

Check out the NA samples

D2s <- c("D_2__Ancyromonadida", "D_2__Apusomonadidae", "D_2__Chloroplastida", 
         "D_2__Holozoa", "D_2__Prymnesiophyceae", "D_2__Rhizaria", "D_2__Stramenopiles")
physeqNA <- subset_taxa(ps2, !(D2 %in% D2s))
physeqNA
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 5 taxa and 42 samples ]
## sample_data() Sample Data:       [ 42 samples by 3 sample variables ]
## tax_table()   Taxonomy Table:    [ 5 taxa by 5 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 5 tips and 4 internal nodes ]
naOTU <- data.frame(otu_table(physeqNA))

blasting the 5 NA sequences revealed that they are most likely host-derived. The top hits for each were Acantharea.

2.2.2 Taxonomic Filtering

Remove Host SVs (SAR) and remaining prey (Opisthokonta) by filtering to keep only Haptophyte SVs.

physeqP <- subset_taxa(ps2, D1=="D_1__Haptophyta")
physeqP
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 21 taxa and 42 samples ]
## sample_data() Sample Data:       [ 42 samples by 3 sample variables ]
## tax_table()   Taxonomy Table:    [ 21 taxa by 5 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 21 tips and 20 internal nodes ]

21 symbiotic SVs remain

2.2.3 Transformation

Transform count data to relative abundance (as %) to normalize for differences in library size.

physeqPra<- transform_sample_counts(physeqP, function(OTU) 100* OTU/sum(OTU))

2.3 Distance and Ordination

Determine the Bray-Curtis distances between samples and perform Principal Coordinate Analysis (PCoA). Plot the PCoA.

ordu = ordinate(physeqPra, "PCoA", "bray")
p<-plot_ordination(physeqPra, ordu, color="Location", shape = "HostSV")+theme_bw() +scale_color_manual(values=cbPalette)+ geom_point(size=3)+
  theme(text=element_text(size=16,  family="serif"))

2.4 Significance Testing

PERMANOVA with Vegan function adonis by collection location:

set.seed(1)
OTUs <- t(data.frame(otu_table(physeqPra))) #get data frame of symbiont SVs from phyloseq object object
meta <- metatable[metatable$Source=="host",] # filter sample data to include ONLY the samples included in this analysis. Otherwise, adonis will give an error. 

adonis(vegdist(OTUs, method = "bray") ~ Location, data = meta)
## 
## Call:
## adonis(formula = vegdist(OTUs, method = "bray") ~ Location, data = meta) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## Location   7    3.1581 0.45116  4.2343 0.46574  0.001 ***
## Residuals 34    3.6227 0.10655         0.53426           
## Total     41    6.7808                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

and by Host SV type:

set.seed(1)
adonis(vegdist(OTUs, method = "bray") ~ HostSV, data = meta)
## 
## Call:
## adonis(formula = vegdist(OTUs, method = "bray") ~ HostSV, data = meta) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)   
## HostSV     4    1.3050 0.32625  2.2045 0.19246  0.009 **
## Residuals 37    5.4758 0.14800         0.80754          
## Total     41    6.7808                 1.00000          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Export filtered SV table to perforem pairwise comparisons in Q2.

OTUs4Q2 <- data.frame(otu_table(physeqPra))
OTUs4Q2 <- cbind(rownames(OTUs4Q2), data.frame(OTUs4Q2, row.names=NULL))
colnames(OTUs4Q2)[1] <- "OTU ID"
write.table(OTUs4Q2, file='feature-table-2.txt', quote=FALSE, sep='\t', row.names = FALSE)

Export sample data for Q2 because sample “-” in sample names have changed to “.”

meta4Q2 <- cbind(rownames(meta), data.frame(meta, row.names=NULL))
colnames(meta4Q2)[1] <- "SampleID"
write.table(meta4Q2, file='metatable.tsv', quote=FALSE, sep='\t', row.names = FALSE) 

2.5 Relative Abundance Plot

symbioOTUs <- data.frame(otu_table(physeqPra)) #get data frame of symbiotic OTUs from phyloseq object
symbioOTUs$row <- row.names(symbioOTUs)

#make data frame of Symbiont OTUs with original counts
singles <- read.delim("singles-feature-table-renamed.txt")
singlesyms <- data.frame(symbioOTUs[,43])
names(singlesyms) <- c("row")
syms <- merge(singlesyms, singles, by="row")

#make dataframe for merging with environmental samples dataframe later
syms4filters <-syms
row.names(syms4filters) <-syms4filters$row
syms4filters<-syms4filters[,(-1)]
syms4filters<- syms4filters[ , order(names(syms4filters))]
MiraiAs <- syms4filters[,13:42]
MiraiAs$row <- rownames(MiraiAs)

#convert to relative abundance
row.names(syms) <-syms$row
singlesRA <- syms[,(-1)]
sumcols <- colSums(singlesRA)
for(i in 1:ncol(singlesRA)) {
  singlesRA[[i]]<- (singlesRA[[i]]/sumcols[[i]])*100
}
syms<- singlesRA

#assign SVs ids based on their location on the phylogenetic tree. This will ensure that the SVs appear in the same order in the stacked bar plots as they do in the phylogenetic tree and will make it easier to look back and forth between the two plots
id <- c("S", "J", "K", "O", "N", "F", "M", "P", "R","A","D","B","H","E","I","G","T","C","U","L","Q")
syms$id2 <- id 

#melt data frame for plotting
msyms<- melt(syms, id.vars="id2")

# put samples in more logical order on x axis and change label names
xorder = c("st2.A.1","st2.A.3","st2.A.5","st2.A.6","st2.A.7","st4.A.9","st10.A.10","st12.A.11","st12.A.12","st12.A.14","st12.A.15","st12.A.16","st13.A.18","st13.A.19","st13.A.20","st13.A.21","st17.A.22","st17.A.25","st17.A.27","st17.A.28", "st17.A.29","st17.A.31","st17.A.33","st17.A.34","st17.A.35","st17.A.36","st17.A.39","st17.A.40","st17.A.42","st17.A.43", "Onna.April.A3", "Onna.April.A4","Onna.May.A1","Onna.May.A3","Onna.May.A6","Onna.May.A7","Onna.May.A10","Onna.May.A11","Onna.May.A12", "Catalina.A.Ae","Catalina.A.C1","Catalina.A.C2" )
xnames = c("st2.1","st2.3","st2.5","st2.6","st2.7","st4.9","st10.10","st12.11","st12.12","st12.14","st12.15","st12.16","st13.18","st13.19","st13.20","st13.21","st17.22","st17.25","st17.27","st17.28", "st17.29","st17.31","st17.33","st17.34","st17.35","st17.36","st17.39","st17.40","st17.42","st17.43", "Oki.3A", "Oki.4A","Oki.1","Oki.3","Oki.6","Oki.7","Oki.10","Oki.11","Oki.12", "Cat.Ae","Cat.C1","Cat.C2")
barorder = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21")
#custom color scheme
colors <- c("#cd4a4a", "#9f1c3b", "#790a1a", "#878787", "#5e5e5e", "#292929", "#95cde8", "#4897d8", "#217ca3", "#bbfaff", "#3aa6a9", "#1b8085","#fff671", "#ffd662", "#f5dfb8", "#f5ba98", "#f77d75", "#bedb92", "#77c063", "#569358", "#065900") 

#plot relative abundances of symbiotic SVs in acantharian hosts
HostSymRA<- ggplot(msyms, aes(x=variable, y=value, fill=id2)) + 
  geom_bar(stat="identity") +
  xlab("") +
  ylab("") +
  theme_bw()+
  scale_x_discrete(limits = xorder, labels = xnames)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  theme(legend.title=element_blank()) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_manual(values=colors,  labels = barorder)

Figure 2

Legend for Figures 2 and 3

g_legend<-function(a.gplot){
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  legend
}
legend <- g_legend(HostSymRA)
SVlegend <- grid.arrange(legend)

2.6 Number of SVs per Host

#count number rof non-zero rows for each column to get number of symbiotic SVs per host
nonzero <- function(x) sum(x != 0)
Svscnt<-as.data.frame(numcolwise(nonzero)(syms))
avg = as.integer(apply(Svscnt,1,mean,na.rm=TRUE)) +1
sd = as.integer(apply(Svscnt,1,sd,na.rm=TRUE)) 
min= apply(Svscnt,1,min,na.rm=TRUE) 
max= apply(Svscnt,1,max,na.rm=TRUE)
## [1] "The mean number of SVs per host is  8 +/- 2 (standard deviation)."
## [1] "There are 4 to 12 SVs per host."

3 Environmental Symbiont Diversity

3.1 Load Data

Load data and prepare dataframes to be imported as phyloseq objects.

filters <- read.delim("feature-table-filters.txt")
#filter feature table to include only the 0.2 um filters
names.use <- c("OTU","st2.DNA.0.2.1","st2.DNA.0.2.2","st4.DNA.0.2.1","st4.DNA.0.2.2","st10.DNA.0.2.1","st10.DNA.0.2.2","st12.DNA.0.2.1","st12.DNA.0.2.2","st13.DNA.0.2.1","st13.DNA.0.2.2","st17.DNA.0.2.1","st17.DNA.0.2.2" )
filters <- filters[,names.use]

3.2 Preprocessing

3.2.1 Filter Symbiotic SVs

Filter dataframe to contain the symbiotic SVs from acantharian hosts.

row.names(filters) <- filters$OTU
filters<- filters[,-(1)]
filters$row <- row.names(filters)
new<- merge(MiraiAs, filters, by = "row") #MiraiAs is dataframe of counts of symbiont SVs in the acantharians collected on Mirai

Convert filtered feature table to OTU phyloseq object.

row.names(new)<-new$row
new <- new[,-1]
fmat <- as.matrix(new)
OTU = otu_table(fmat, taxa_are_rows = TRUE)
physeq<- phyloseq(OTU, TAX, META, tree) #The taxonomy, metadata, and phylogenetic tree are the same as for the acantharian samples

3.2.2 Transformation

Transform count data to relative abundance (as %) to normalize for differences in library size.

physeqPra<- transform_sample_counts(physeq, function(OTU) OTU/sum(OTU))

3.3 Distance and Ordination

Determine the Bray-Curtis distances between samples and perform Principal Coordinate Analysis (PCoA). Plot the PCoA.

ordu = ordinate(physeqPra, "PCoA", "bray")
colors <- c("black", "#0F52BA")
p1<-plot_ordination(physeqPra, ordu, color="Source", shape = "Location") + geom_point(size=3.5)+theme_bw() +scale_color_manual(values=colors) + 
  theme(text=element_text(size=16,  family="serif"))

3.4 Significance Testing

PERMANOVA with Vegan function adonis by sample source (Environment or Host):

OTUs <- t(data.frame(otu_table(physeqPra))) #get data frame of OTUs from phyloseq object
mirai<- c("st2", "st4", "st10", "st12", "st13", "st17")
meta <- metatable[metatable$Location %in% mirai,] #make sure metadata only has the samples included in the feature table
metaA <- meta[meta$Source == "host",]
metaF <- meta[meta$Source == "environment",]
names.use <- c("st2.DNA.0.2.1","st2.DNA.0.2.2","st4.DNA.0.2.1","st4.DNA.0.2.2","st10.DNA.0.2.1","st10.DNA.0.2.2","st12.DNA.0.2.1","st12.DNA.0.2.2","st13.DNA.0.2.1","st13.DNA.0.2.2","st17.DNA.0.2.1","st17.DNA.0.2.2" )
metaF<- metaF[names.use,]
meta <- rbind(metaA, metaF)

set.seed(1)

adonis(vegdist(OTUs, method = "bray") ~ Source, data = meta)
## 
## Call:
## adonis(formula = vegdist(OTUs, method = "bray") ~ Source, data = meta) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## Source     1    2.1458 2.14576  19.378 0.32634  0.001 ***
## Residuals 40    4.4294 0.11073         0.67366           
## Total     41    6.5751                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.5 Relative Abundance Plot

keep<-MiraiAs
keep$id2<-id
keep <- data.frame(keep[,c(31,32)])

envRA <-merge(filters, keep, by = "row")
row.names(envRA) <- envRA$id2
envRA <- envRA[,-c(1,14)]

sumcols <- colSums(envRA)
for(i in 1:ncol(envRA)) {
  envRA[[i]]<- (envRA[[i]]/sumcols[[i]])*100
}

envRA$id2 <- row.names(envRA)
menvRA<- melt(envRA, id.vars="id2")

# put samples in more logical order on x axis and change label names
filterxorder = c("st2.DNA.0.2.1","st2.DNA.0.2.2","st4.DNA.0.2.1","st4.DNA.0.2.2","st10.DNA.0.2.1","st10.DNA.0.2.2","st12.DNA.0.2.1","st12.DNA.0.2.2","st13.DNA.0.2.1","st13.DNA.0.2.2","st17.DNA.0.2.1","st17.DNA.0.2.2" )
filterxnames = c("st2.1","st2.2","st4.1","st4.2","st10.1","st10.2","st12.1","st12.2","st13.1","st13.2","st17.1","st17.2" )
colors <- c("#cd4a4a", "#9f1c3b", "#790a1a", "#878787", "#5e5e5e", "#292929", "#95cde8", "#4897d8", "#217ca3", "#bbfaff", "#3aa6a9", "#1b8085","#fff671", "#ffd662", "#f5dfb8", "#f5ba98", "#f77d75", "#bedb92", "#77c063", "#569358", "#065900") 

envSymRA<- ggplot(menvRA, aes(x=variable, y=value, fill=id2)) + 
  geom_bar(stat="identity") +
  xlab("") +
  ylab("") +
  scale_x_discrete(limits = filterxorder, labels = filterxnames)+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_fill_manual(values=colors)+
  theme(legend.title=element_blank()) +
  scale_y_continuous(expand = c(0, 0)) + theme(legend.position="none")

Figure 3

4 Complete Environmental Community

How good are the replicates when condsidering the entire environmental microbial eukaryote community?

4.1 Load Data

filters <- read.delim("feature-table-filters.txt")
#filter feature table to include only the 0.2 um filters
names.use <- c("OTU","st2.DNA.0.2.1","st2.DNA.0.2.2","st4.DNA.0.2.1","st4.DNA.0.2.2","st10.DNA.0.2.1","st10.DNA.0.2.2","st12.DNA.0.2.1","st12.DNA.0.2.2","st13.DNA.0.2.1","st13.DNA.0.2.2","st17.DNA.0.2.1","st17.DNA.0.2.2" )
filters <- filters[,names.use]
#convert feature table to OTU phyloseq object 
row.names(filters)<- filters[[1]]
filters<-filters[,-(1)]
fmat <- as.matrix(filters)

Convert to phyloseq object

OTU = otu_table(fmat, taxa_are_rows = TRUE)
ps<- phyloseq(OTU, TAX, META, tree)

4.2 Preprocessing

4.2.1 Prevalence Filtering

prevdf = apply(X = otu_table(ps),
               MARGIN = ifelse(taxa_are_rows(ps), yes = 1, no = 2),
               FUN = function(x){sum(x > 0)})

prevdf = data.frame(Prevalence = prevdf,
                    TotalAbundance = taxa_sums(ps),
                    tax_table(ps))

plyr::ddply(prevdf, "D2", function(df1){cbind(mean(df1$Prevalence),sum(df1$Prevalence))})
##                           D2         1    2
## 1             Ambiguous_taxa 5.4615385  213
## 2             D_2__Alveolata 1.6718223 5945
## 3        D_2__Ancyromonadida 0.0000000    0
## 4        D_2__Apusomonadidae 0.0000000    0
## 5        D_2__Chloroplastida 1.2911392  102
## 6       D_2__Cryptomonadales 0.6666667    6
## 7               D_2__Discoba 0.1111111    1
## 8              D_2__Discosea 0.0000000    0
## 9               D_2__Holozoa 0.5857143  123
## 10     D_2__Kathablepharidae 0.2222222    2
## 11          D_2__Nucletmycea 0.0000000    0
## 12                D_2__P1-31 6.0000000   12
## 13         D_2__Palpitomonas 1.8965517   55
## 14                 D_2__Pav3 0.0000000    0
## 15       D_2__Pavlovophyceae 1.0000000    1
## 16     D_2__Prymnesiophyceae 2.4438202  870
## 17             D_2__Rhizaria 0.3233256  140
## 18         D_2__Rhodophyceae 0.0000000    0
## 19        D_2__Stramenopiles 1.7767857  398
## 20             D_2__Telonema 0.9354839   58
## 21 D_2__uncultured eukaryote 1.9090909   21
## 22                      <NA> 0.7053790  577
envPrevplot<- ggplot(prevdf, aes(TotalAbundance, Prevalence / nsamples(ps),color=D1)) + geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) +  geom_point(size = 2, alpha = 0.7) +theme_bw()+
  scale_x_log10() +  xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
  facet_wrap(~D1) + theme(legend.position="none")
## Warning: Transformation introduced infinite values in continuous x-axis

Apply Prevalence Filter

prevalenceThreshold = 0.05 * nsamples(ps)
keepTaxa = rownames(prevdf)[(prevdf$Prevalence >= prevalenceThreshold)]
ps2 = prune_taxa(keepTaxa, ps)

4.2.2 Taxonomic Filtering

Remove SVs without Phylum classification.

keeps<- c("D_1__Archaeplastida", "D_1__Cryptophyceae", "D_1__Haptophyta", "D_1__Incertae Sedis", "D_1__Opisthokonta", "D_1__Picozoa", "D_1__SAR")
ps2 <- subset_taxa(ps2, D1 %in% keeps)
table(tax_table(ps2)[, "D3"], exclude = NULL)
## 
##            Ambiguous_taxa                     D_3__ 
##                        18                        33 
##          D_3__Bicosoecida             D_3__Cercozoa 
##                         2                        13 
##          D_3__Chlorophyta     D_3__Choanoflagellida 
##                        37                        26 
##           D_3__Ciliophora       D_3__Dinoflagellata 
##                       233                       195 
##  D_3__Labyrinthulomycetes         D_3__Leucocryptos 
##                         3                         2 
##               D_3__MAST-1              D_3__MAST-11 
##                         3                         2 
##               D_3__MAST-3               D_3__MAST-7 
##                         7                        11 
##               D_3__MAST-8               D_3__MAST-9 
##                         1                         3 
##   D_3__Metazoa (Animalia)           D_3__Ochrophyta 
##                        24                        23 
##              D_3__Pavlova          D_3__Phaeocystis 
##                         1                         5 
##        D_3__Protalveolata         D_3__Prymnesiales 
##                      1387                       172 
##              D_3__Retaria            D_3__RM2-SGM51 
##                        44                         3 
## D_3__uncultured eukaryote                      <NA> 
##                        29                       268

4.2.3 Transformation

Transform count data to relative abundance (as %) to normalize for differences in library size.

ps2ra<- transform_sample_counts(ps2, function(OTU) OTU/sum(OTU))

4.3 Distance and Ordination

Determine the Bray-Curtis distances between samples and perform Principal Coordinate Analysis (PCoA). Plot the PCoA.

cbPalette <- c("#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ordu = ordinate(ps2ra, "PCoA", "bray")
p3<-plot_ordination(ps2ra, ordu, color="Location") + geom_point(size=3.5) +scale_color_manual(values=cbPalette) +theme_bw() + 
  theme(text=element_text(size=16,  family="serif"))