Keep only samples with sample_data matching one or more conditions. By default this function also removes taxa which never appear in any of the remaining samples, by running tax_filter(min_prevalence = 1). You can prevent this taxa filtering with .keep_all_taxa = TRUE.
ps_filter(ps, ..., .target = "sample_data", .keep_all_taxa = FALSE)
phyloseq object (with filtered sample_data)
Use ps_filter as you would use use dplyr::filter(), but with a phyloseq object!
filter
explains better how to give arguments to this function
tax_filter
for filtering taxa (not samples)
library(phyloseq)
library(dplyr)
data("enterotype", package = "phyloseq")
enterotype
#> phyloseq-class experiment-level object
#> otu_table() OTU Table: [ 553 taxa and 280 samples ]
#> sample_data() Sample Data: [ 280 samples by 9 sample variables ]
#> tax_table() Taxonomy Table: [ 553 taxa by 1 taxonomic ranks ]
sample_data(enterotype)[1:10, 1:5]
#> Enterotype Sample_ID SeqTech SampleID Project
#> AM.AD.1 <NA> AM.AD.1 Sanger AM.AD.1 gill06
#> AM.AD.2 <NA> AM.AD.2 Sanger AM.AD.2 gill06
#> AM.F10.T1 <NA> AM.F10.T1 Sanger AM.F10.T1 turnbaugh09
#> AM.F10.T2 3 AM.F10.T2 Sanger AM.F10.T2 turnbaugh09
#> DA.AD.1 2 DA.AD.1 Sanger DA.AD.1 MetaHIT
#> DA.AD.1T <NA> DA.AD.1T Sanger <NA> <NA>
#> DA.AD.2 3 DA.AD.2 Sanger DA.AD.2 MetaHIT
#> DA.AD.3 3 DA.AD.3 Sanger DA.AD.3 MetaHIT
#> DA.AD.3T <NA> DA.AD.3T Sanger <NA> <NA>
#> DA.AD.4 2 DA.AD.4 Sanger DA.AD.4 MetaHIT
# keep only samples with seqtech not equal to sanger
ps1 <- ps_filter(enterotype, SeqTech != "Sanger")
ps1
#> phyloseq-class experiment-level object
#> otu_table() OTU Table: [ 552 taxa and 239 samples ]
#> sample_data() Sample Data: [ 239 samples by 9 sample variables ]
#> tax_table() Taxonomy Table: [ 552 taxa by 1 taxonomic ranks ]
sample_data(ps1)[1:10, 1:5]
#> Enterotype Sample_ID SeqTech SampleID Project
#> MH0001 2 MH0001 Illumina <NA> <NA>
#> MH0002 1 MH0002 Illumina <NA> <NA>
#> MH0003 1 MH0003 Illumina <NA> <NA>
#> MH0004 1 MH0004 Illumina <NA> <NA>
#> MH0005 2 MH0005 Illumina <NA> <NA>
#> MH0006 2 MH0006 Illumina <NA> <NA>
#> MH0007 1 MH0007 Illumina <NA> <NA>
#> MH0008 1 MH0008 Illumina <NA> <NA>
#> MH0009 3 MH0009 Illumina <NA> <NA>
#> MH0010 1 MH0010 Illumina <NA> <NA>
# keep only samples with no NAs in any variables
ps2 <- enterotype %>% ps_filter(!if_any(everything(), is.na))
ps2
#> phyloseq-class experiment-level object
#> otu_table() OTU Table: [ 221 taxa and 31 samples ]
#> sample_data() Sample Data: [ 31 samples by 9 sample variables ]
#> tax_table() Taxonomy Table: [ 221 taxa by 1 taxonomic ranks ]
sample_data(ps2)[1:8, 1:8]
#> Enterotype Sample_ID SeqTech SampleID Project Nationality Gender Age
#> DA.AD.1 2 DA.AD.1 Sanger DA.AD.1 MetaHIT danish F 59
#> DA.AD.2 3 DA.AD.2 Sanger DA.AD.2 MetaHIT danish M 54
#> DA.AD.3 3 DA.AD.3 Sanger DA.AD.3 MetaHIT danish F 49
#> DA.AD.4 2 DA.AD.4 Sanger DA.AD.4 MetaHIT danish M 59
#> ES.AD.1 1 ES.AD.1 Sanger ES.AD.1 MetaHIT spanish F 25
#> ES.AD.2 2 ES.AD.2 Sanger ES.AD.2 MetaHIT spanish M 49
#> ES.AD.3 2 ES.AD.3 Sanger ES.AD.3 MetaHIT spanish F 47
#> ES.AD.4 3 ES.AD.4 Sanger ES.AD.4 MetaHIT spanish F 38
# ps2 is equivalent to dropping samples with incomplete sample_variables and tax_filtering 0s
ps3 <- enterotype %>%
ps_drop_incomplete() %>%
tax_filter(undetected = 0, use_counts = FALSE)
# we needed to set a low detection threshold because this example data is proportions
identical(ps2, ps3) # TRUE
#> [1] TRUE
# function will give warning if some of the otu_values are negative
# (which may happen when filtering data that has e.g. clr-transformed taxa abundances)
# as it attempts to discard any taxa that become always absent/0 after filtering (by default)
# set .keep_all_taxa = TRUE to avoid this filtering behaviour, which is unwanted in this case
enterotype %>%
tax_transform("clr") %>%
ps_get() %>%
ps_filter(SeqTech == "Sanger", .keep_all_taxa = TRUE)
#> Warning: otu_table of counts is NOT available!
#> Available otu_table contains 50166 values that are not non-negative integers
#> phyloseq-class experiment-level object
#> otu_table() OTU Table: [ 553 taxa and 41 samples ]
#> sample_data() Sample Data: [ 41 samples by 9 sample variables ]
#> tax_table() Taxonomy Table: [ 553 taxa by 2 taxonomic ranks ]