Filter samples/variables based on the conditions
Usage
filter_samples(object, flist, prune = TRUE, apply_to = "all")
filter_variables(
object,
flist,
prune = TRUE,
apply_to = "all",
according_to_samples = "all"
)
Arguments
- object
(required) mass_dataset class object.
- flist
(required) A function or list of functions that take a vector of abundance values and return a logical.
- prune
(optional) A logical. Default
TRUE. If TRUE, then the function returns the pruned mass_dataset-class object, rather than the logical vector of samples/variables that passed the filter.
- apply_to
(optional) Which samples (for filter_samples) or variables (for filter_variables) the filter is applied to. Default is "all". If you only want to apply it to specific samples/variables, set it as a vector of their names. All other samples/variables will be set as TRUE.
- according_to_samples
(optional) Which samples are used to filter variables. Default is "all". If you want to use only some of the samples, provide their names as a vector.
Value
If prune is FALSE, a logical vector whose length equals the number of samples/variables in the mass_dataset-class object.
If prune is TRUE (the default), the pruned mass_dataset-class
object is returned instead.
Author
Xiaotao Shen xiaotao.shen@outlook.com
Examples
data("expression_data")
data("sample_info")
data("variable_info")
object =
create_mass_dataset(
expression_data = expression_data,
sample_info = sample_info,
variable_info = variable_info
)
filter_samples(object, function(x) {
sum(is.na(x)) / length(x) < 0.4
})
#> --------------------
#> massdataset version: 0.99.1
#> --------------------
#> 1.expression_data:[ 1000 x 2 data.frame]
#> 2.sample_info:[ 2 x 4 data.frame]
#> 2 samples:QC_1 QC_2
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 1000 variables:M136T55_2_POS M79T35_POS M307T548_POS ... M232T937_POS M301T277_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2026-03-02 09:27:07
#> filter_samples ----------
#> Package Function.used Time
#> 1 massdataset filter_samples() 2026-03-02 09:27:07
filter_samples(object, function(x) {
sum(is.na(x)) / length(x) < 0.4
}, prune = FALSE)
#> Blank_3 Blank_4 QC_1 QC_2 PS4P1 PS4P2 PS4P3 PS4P4
#> FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
##only apply the filter to Subject samples
object2 =
filter_samples(
object = object,
flist = function(x) {
sum(is.na(x))/length(x) < 0.2
},
prune = TRUE,
apply_to = get_sample_id(object)[extract_sample_info(object)$class == "Subject"]
)
object2
#> --------------------
#> massdataset version: 0.99.1
#> --------------------
#> 1.expression_data:[ 1000 x 4 data.frame]
#> 2.sample_info:[ 4 x 4 data.frame]
#> 4 samples:Blank_3 Blank_4 QC_1 QC_2
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 1000 variables:M136T55_2_POS M79T35_POS M307T548_POS ... M232T937_POS M301T277_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2026-03-02 09:27:07
#> filter_samples ----------
#> Package Function.used Time
#> 1 massdataset filter_samples() 2026-03-02 09:27:07
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ forcats 1.0.0 ✔ stringr 1.5.1
#> ✔ lubridate 1.9.4 ✔ tibble 3.3.0
#> ✔ purrr 1.1.0 ✔ tidyr 1.3.1
#> ✔ readr 2.1.5
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ tidyr::extract() masks magrittr::extract()
#> ✖ dplyr::filter() masks massdataset::filter(), stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ✖ purrr::set_names() masks magrittr::set_names()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data("expression_data")
data("sample_info")
data("variable_info")
object =
create_mass_dataset(
expression_data = expression_data,
sample_info = sample_info,
variable_info = variable_info
)
object
#> --------------------
#> massdataset version: 0.99.1
#> --------------------
#> 1.expression_data:[ 1000 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 8 samples:Blank_3 Blank_4 QC_1 ... PS4P3 PS4P4
#> 3.variable_info:[ 1000 x 3 data.frame]
#> 1000 variables:M136T55_2_POS M79T35_POS M307T548_POS ... M232T937_POS M301T277_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 1 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2026-03-02 09:27:07
####Keep variables which have less than 50% MVs across all samples.
library(tidyverse)
filter_variables(object, function(x) {
sum(is.na(x)) / length(x) < 0.5
}, prune = FALSE) %>%
head()
#> M136T55_2_POS M79T35_POS M307T548_POS M183T224_POS M349T47_POS
#> TRUE TRUE TRUE FALSE TRUE
#> M182T828_POS
#> TRUE
filter_variables(object, function(x) {
sum(is.na(x)) / length(x) < 0.5
},
prune = TRUE)
#> --------------------
#> massdataset version: 0.99.1
#> --------------------
#> 1.expression_data:[ 422 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 8 samples:Blank_3 Blank_4 QC_1 ... PS4P3 PS4P4
#> 3.variable_info:[ 422 x 3 data.frame]
#> 422 variables:M136T55_2_POS M79T35_POS M307T548_POS ... M236T543_POS M232T937_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2026-03-02 09:27:07
#> filter_variables ----------
#> Package Function.used Time
#> 1 massdataset filter_variables() 2026-03-02 09:27:07
####Keep variables which have less than 50% MVs in the QC samples only.
filter_variables(
object,
flist = function(x) {
sum(is.na(x)) / length(x) < 0.5
},
prune = TRUE,
according_to_samples =
get_sample_id(object)[extract_sample_info(object)$class == "QC"]
)
#> --------------------
#> massdataset version: 0.99.1
#> --------------------
#> 1.expression_data:[ 496 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 8 samples:Blank_3 Blank_4 QC_1 ... PS4P3 PS4P4
#> 3.variable_info:[ 496 x 3 data.frame]
#> 496 variables:M136T55_2_POS M79T35_POS M307T548_POS ... M361T681_POS M236T543_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2026-03-02 09:27:07
#> filter_variables ----------
#> Package Function.used Time
#> 1 massdataset filter_variables() 2026-03-02 09:27:07
####Keep variables which have less than 50% MVs in QC or Subject samples.
idx1 =
filter_variables(
object,
flist = function(x) {
sum(is.na(x)) / length(x) < 0.5
},
prune = FALSE,
according_to_samples =
get_sample_id(object)[extract_sample_info(object)$class == "QC"]
)
idx2 =
filter_variables(
object,
flist = function(x) {
sum(is.na(x)) / length(x) < 0.5
},
prune = FALSE,
according_to_samples =
get_sample_id(object)[extract_sample_info(object)$class == "Subject"]
)
idx =
which(idx1 | idx2)
object2 = object[idx,]
object2
#> --------------------
#> massdataset version: 0.99.1
#> --------------------
#> 1.expression_data:[ 642 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 8 samples:Blank_3 Blank_4 QC_1 ... PS4P3 PS4P4
#> 3.variable_info:[ 642 x 3 data.frame]
#> 642 variables:M136T55_2_POS M79T35_POS M307T548_POS ... M232T937_POS M301T277_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2026-03-02 09:27:07
#> subset ----------
#> Package Function.used Time
#> 1 massdataset [ 2026-03-02 09:27:07
####filter variables with RSD (in QC samples) < 30
object3 =
filter_variables(
object = object,
flist = function(x) {
rsd = sd(x) * 100 / mean(x)
rsd = ifelse(is.na(rsd), 100, rsd)
rsd < 30
},
apply_to = "all",
prune = TRUE,
according_to_samples = get_sample_id(object)[extract_sample_info(object)$class == "QC"]
)
object3
#> --------------------
#> massdataset version: 0.99.1
#> --------------------
#> 1.expression_data:[ 328 x 8 data.frame]
#> 2.sample_info:[ 8 x 4 data.frame]
#> 8 samples:Blank_3 Blank_4 QC_1 ... PS4P3 PS4P4
#> 3.variable_info:[ 328 x 3 data.frame]
#> 328 variables:M307T548_POS M299T359_POS M344T471_POS ... M361T681_POS M236T543_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2026-03-02 09:27:07
#> filter_variables ----------
#> Package Function.used Time
#> 1 massdataset filter_variables() 2026-03-02 09:27:07
