Data preparation
After the raw data processing
, peak tables for positive and negative mode have been generated.
Next, we need to get the peak table and sample information and organize them as mass_dataset
class objects.
library(tidymass)
#> Registered S3 method overwritten by 'Hmisc':
#> method from
#> vcov.default fit.models
#> ── Attaching packages ──────────────────────────────────────── tidymass 1.0.9 ──
#> ✔ massdataset 1.0.34 ✔ metid 1.2.33
#> ✔ massprocesser 1.0.10 ✔ masstools 1.0.13
#> ✔ masscleaner 1.0.12 ✔ dplyr 1.1.4
#> ✔ massqc 1.0.7 ✔ ggplot2 3.5.1
#> ✔ massstat 1.0.6 ✔ magrittr 2.0.3
#> ✔ metpath 1.0.8
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ forcats 1.0.0 ✔ readr 2.1.5
#> ✔ lubridate 1.9.3 ✔ stringr 1.5.1
#> ✔ purrr 1.0.2 ✔ tibble 3.2.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ xcms::collect() masks dplyr::collect()
#> ✖ MSnbase::combine() masks Biobase::combine(), BiocGenerics::combine(), dplyr::combine()
#> ✖ tidyr::expand() masks S4Vectors::expand()
#> ✖ tidyr::extract() masks magrittr::extract()
#> ✖ metid::filter() masks metpath::filter(), dplyr::filter(), massdataset::filter(), stats::filter()
#> ✖ S4Vectors::first() masks dplyr::first()
#> ✖ xcms::groups() masks dplyr::groups()
#> ✖ dplyr::lag() masks stats::lag()
#> ✖ purrr::map() masks mixOmics::map()
#> ✖ BiocGenerics::Position() masks ggplot2::Position(), base::Position()
#> ✖ purrr::reduce() masks MSnbase::reduce()
#> ✖ S4Vectors::rename() masks dplyr::rename(), massdataset::rename()
#> ✖ lubridate::second() masks S4Vectors::second()
#> ✖ lubridate::second<-() masks S4Vectors::second<-()
#> ✖ MASS::select() masks dplyr::select(), massdataset::select()
#> ✖ purrr::set_names() masks magrittr::set_names()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Positive mode
Load object
.
load("mzxml_ms1_data/POS/Result/object")
object_pos <- object
object_pos
#> --------------------
#> massdataset version: 0.99.8
#> --------------------
#> 1.expression_data:[ 10149 x 259 data.frame]
#> 2.sample_info:[ 259 x 4 data.frame]
#> 259 samples:sample_06 sample_103 sample_11 ... sample_QC_38 sample_QC_39
#> 3.variable_info:[ 10149 x 3 data.frame]
#> 10149 variables:M70T73_POS M70T53_POS M70T183_POS ... M923T55_POS M992T641_POS
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-02-23 08:37:06
#> process_data ----------
#> Package Function.used Time
#> 1 massprocesser process_data 2022-02-23 08:36:42
Read sample information.
sample_info_pos <- readr::read_csv("sample_info/sample_info_pos.csv")
#> Rows: 259 Columns: 6
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (4): sample_id, class, subject_id, group
#> dbl (2): injection.order, batch
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(sample_info_pos)
#> # A tibble: 6 × 6
#> sample_id injection.order class batch subject_id group
#> <chr> <dbl> <chr> <dbl> <chr> <chr>
#> 1 sample_QC_01 1 QC 1 <NA> QC
#> 2 sample_01 2 Subject 1 subject_474 Control
#> 3 sample_02 3 Subject 1 subject_431 Control
#> 4 sample_06 4 Subject 1 subject_414 Case
#> 5 sample_07 5 Subject 1 subject_830 Control
#> 6 sample_11 6 Subject 1 subject_125 Case
Add sample_info_pos
to object_pos
object_pos %>%
extract_sample_info() %>%
head()
#> sample_id group class injection.order
#> 1 sample_06 Case Subject 1
#> 2 sample_103 Case Subject 2
#> 3 sample_11 Case Subject 3
#> 4 sample_112 Case Subject 4
#> 5 sample_117 Case Subject 5
#> 6 sample_12 Case Subject 6
object_pos <-
object_pos %>%
activate_mass_dataset(what = "sample_info") %>%
dplyr::select(-c("group", "class", "injection.order"))
object_pos =
object_pos %>%
activate_mass_dataset(what = "sample_info") %>%
left_join(sample_info_pos,
by = "sample_id")
object_pos %>%
extract_sample_info() %>%
head()
#> sample_id injection.order class batch subject_id group
#> 1 sample_06 4 Subject 1 subject_414 Case
#> 2 sample_103 71 Subject 1 subject_330 Case
#> 3 sample_11 6 Subject 1 subject_125 Case
#> 4 sample_112 78 Subject 1 subject_295 Case
#> 5 sample_117 80 Subject 1 subject_793 Case
#> 6 sample_12 8 Subject 1 subject_387 Case
Save the object_pos
in a new folder named as data_cleaning
.
dir.create("data_cleaning/POS", showWarnings = FALSE, recursive = TRUE)
save(object_pos, file = "data_cleaning/POS/object_pos")
object_pos
#> --------------------
#> massdataset version: 0.99.8
#> --------------------
#> 1.expression_data:[ 10149 x 259 data.frame]
#> 2.sample_info:[ 259 x 6 data.frame]
#> 259 samples:sample_06 sample_103 sample_11 ... sample_QC_38 sample_QC_39
#> 3.variable_info:[ 10149 x 3 data.frame]
#> 10149 variables:M70T73_POS M70T53_POS M70T183_POS ... M923T55_POS M992T641_POS
#> 4.sample_info_note:[ 6 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-02-23 08:37:06
#> process_data ----------
#> Package Function.used Time
#> 1 massprocesser process_data 2022-02-23 08:36:42
dim(object_pos)
#> variables samples
#> 10149 259
object_pos %>%
activate_mass_dataset(what = "sample_info") %>%
dplyr::count(class)
#> class n
#> 1 QC 39
#> 2 Subject 220
object_pos %>%
activate_mass_dataset(what = "sample_info") %>%
dplyr::count(group)
#> group n
#> 1 Case 110
#> 2 Control 110
#> 3 QC 39
object_pos %>%
activate_mass_dataset(what = "sample_info") %>%
dplyr::count(batch)
#> batch n
#> 1 1 112
#> 2 2 147
So for positive mode, we have 259 samples and 10,149 variables. 220 subject samples and 39 QC samples. 110 control samples and 110 case samples. Two batches in total, 112 samples in batch 1 and 147 in batch 2.
Next, we can get the peak distributation plot of positive mode.
object_pos %>%
`+`(1) %>%
log(10) %>%
show_mz_rt_plot() +
scale_size_continuous(range = c(0.01, 2))

We can explore the missing values (mvs) in positive mode data.
get_mv_number(object = object_pos)
#> [1] 785821
785,821 mvs in total.
get_mv_number(object = object_pos, by = "sample") %>%
head()
#> sample_06 sample_103 sample_11 sample_112 sample_117 sample_12
#> 4016 2711 4063 2981 2919 3844
Missing value number in each sample.
get_mv_number(object = object_pos, by = "variable") %>%
head()
#> M70T73_POS M70T53_POS M70T183_POS M70T527_POS M71T695_POS M71T183_POS
#> 129 16 155 54 133 169
Missing value number in each variable.
We can use the figure to show the missing value information.
show_missing_values(object = object_pos, show_column_names = FALSE, percentage = TRUE)

Show the mvs in samples.
show_sample_missing_values(object = object_pos, percentage = TRUE)

Show the mvs in variables
show_variable_missing_values(
object = object_pos,
percentage = TRUE,
show_x_text = FALSE,
show_x_ticks = FALSE
) +
scale_size_continuous(range = c(0.01, 1))

Negative mode
Load object
.
load("mzxml_ms1_data/NEG/Result/object")
object_neg <- object
object_neg
#> --------------------
#> massdataset version: 0.99.8
#> --------------------
#> 1.expression_data:[ 8804 x 259 data.frame]
#> 2.sample_info:[ 259 x 4 data.frame]
#> 259 samples:sample_06 sample_103 sample_11 ... sample_QC_38 sample_QC_39
#> 3.variable_info:[ 8804 x 3 data.frame]
#> 8804 variables:M70T712_NEG M70T527_NEG M70T587_NEG ... M941T65_NEG M948T641_NEG
#> 4.sample_info_note:[ 4 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-02-23 08:38:19
#> process_data ----------
#> Package Function.used Time
#> 1 massprocesser process_data 2022-02-23 08:38:02
Read sample information.
sample_info_neg <-
readr::read_csv("sample_info/sample_info_neg.csv")
#> Rows: 259 Columns: 6
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (4): sample_id, class, subject_id, group
#> dbl (2): injection.order, batch
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(sample_info_neg)
#> # A tibble: 6 × 6
#> sample_id injection.order class batch subject_id group
#> <chr> <dbl> <chr> <dbl> <chr> <chr>
#> 1 sample_QC_01 1 QC 1 <NA> QC
#> 2 sample_01 2 Subject 1 subject_474 Control
#> 3 sample_02 3 Subject 1 subject_431 Control
#> 4 sample_06 4 Subject 1 subject_414 Case
#> 5 sample_07 5 Subject 1 subject_830 Control
#> 6 sample_11 6 Subject 1 subject_125 Case
Add sample_info_neg
to object_neg
object_neg %>%
extract_sample_info() %>%
head()
#> sample_id group class injection.order
#> 1 sample_06 Case Subject 1
#> 2 sample_103 Case Subject 2
#> 3 sample_11 Case Subject 3
#> 4 sample_112 Case Subject 4
#> 5 sample_117 Case Subject 5
#> 6 sample_12 Case Subject 6
object_neg <-
object_neg %>%
activate_mass_dataset(what = "sample_info") %>%
dplyr::select(-c("group", "class", "injection.order"))
object_neg <-
object_neg %>%
activate_mass_dataset(what = "sample_info") %>%
left_join(sample_info_neg,
by = "sample_id")
object_neg %>%
extract_sample_info() %>%
head()
#> sample_id injection.order class batch subject_id group
#> 1 sample_06 4 Subject 1 subject_414 Case
#> 2 sample_103 71 Subject 1 subject_330 Case
#> 3 sample_11 6 Subject 1 subject_125 Case
#> 4 sample_112 78 Subject 1 subject_295 Case
#> 5 sample_117 80 Subject 1 subject_793 Case
#> 6 sample_12 8 Subject 1 subject_387 Case
Save the object_neg
in a new folder named as data_cleaning
.
dir.create("data_cleaning/NEG", showWarnings = FALSE, recursive = TRUE)
save(object_neg, file = "data_cleaning/NEG/object_neg")
object_neg
#> --------------------
#> massdataset version: 0.99.8
#> --------------------
#> 1.expression_data:[ 8804 x 259 data.frame]
#> 2.sample_info:[ 259 x 6 data.frame]
#> 259 samples:sample_06 sample_103 sample_11 ... sample_QC_38 sample_QC_39
#> 3.variable_info:[ 8804 x 3 data.frame]
#> 8804 variables:M70T712_NEG M70T527_NEG M70T587_NEG ... M941T65_NEG M948T641_NEG
#> 4.sample_info_note:[ 6 x 2 data.frame]
#> 5.variable_info_note:[ 3 x 2 data.frame]
#> 6.ms2_data:[ 0 variables x 0 MS2 spectra]
#> --------------------
#> Processing information
#> 2 processings in total
#> create_mass_dataset ----------
#> Package Function.used Time
#> 1 massdataset create_mass_dataset() 2022-02-23 08:38:19
#> process_data ----------
#> Package Function.used Time
#> 1 massprocesser process_data 2022-02-23 08:38:02
dim(object_neg)
#> variables samples
#> 8804 259
object_neg %>%
activate_mass_dataset(what = "sample_info") %>%
dplyr::count(class)
#> class n
#> 1 QC 39
#> 2 Subject 220
object_neg %>%
activate_mass_dataset(what = "sample_info") %>%
dplyr::count(group)
#> group n
#> 1 Case 110
#> 2 Control 110
#> 3 QC 39
object_neg %>%
activate_mass_dataset(what = "sample_info") %>%
dplyr::count(batch)
#> batch n
#> 1 1 86
#> 2 2 173
So for negative mode, we have 259 samples and 8,804 variables. 220 subject samples and 39 QC samples. 110 control samples and 110 case samples. Two batches in total, 112 samples in batch 1 and 147 in batch 2.
Next, we can get the peak distributation plot of negative mode.
object_neg %>%
`+`(1) %>%
log(10) %>%
show_mz_rt_plot() +
scale_size_continuous(range = c(0.01, 2))

We can explore the missing values in negitive mode data.
get_mv_number(object = object_neg)
#> [1] 748253
748,253 mvs in total.
get_mv_number(object = object_neg, by = "sample") %>%
head()
#> sample_06 sample_103 sample_11 sample_112 sample_117 sample_12
#> 3029 2766 3298 3100 2912 3053
Missing value number in each sample.
get_mv_number(object = object_neg, by = "variable") %>%
head()
#> M70T712_NEG M70T527_NEG M70T587_NEG M70T47_NEG M71T587_NEG M71T641_NEG
#> 16 137 2 146 41 19
Missing value number in each variable.
We can use the figure to show the missing value information.
show_missing_values(object = object_neg, show_column_names = FALSE, percentage = TRUE)

Show the mvs in samples.
show_sample_missing_values(object = object_neg, percentage = TRUE)

Show the mvs in variables.
show_variable_missing_values(object = object_neg,
percentage = TRUE,
show_x_text = FALSE,
show_x_ticks = FALSE) +
scale_size_continuous(range = c(0.01, 1))

Conclusion
So from those exploration, we have a brief summary of our data. Next, we will use masscleaner
pacakge to do the data cleaning of data.
Session information
sessionInfo()
#> R version 4.4.1 (2024-06-14)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS 15.0
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> time zone: Asia/Singapore
#> tzcode source: internal
#>
#> attached base packages:
#> [1] grid stats4 stats graphics grDevices utils datasets
#> [8] methods base
#>
#> other attached packages:
#> [1] lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
#> [4] purrr_1.0.2 readr_2.1.5 tibble_3.2.1
#> [7] tidyverse_2.0.0 metid_1.2.33 metpath_1.0.8
#> [10] ComplexHeatmap_2.20.0 mixOmics_6.28.0 lattice_0.22-6
#> [13] MASS_7.3-61 massstat_1.0.6 tidyr_1.3.1
#> [16] ggfortify_0.4.17 massqc_1.0.7 masscleaner_1.0.12
#> [19] MSnbase_2.30.1 ProtGenerics_1.36.0 S4Vectors_0.42.1
#> [22] Biobase_2.64.0 BiocGenerics_0.50.0 mzR_2.38.0
#> [25] Rcpp_1.0.13 xcms_4.2.3 BiocParallel_1.38.0
#> [28] massprocesser_1.0.10 ggplot2_3.5.1 dplyr_1.1.4
#> [31] magrittr_2.0.3 masstools_1.0.13 massdataset_1.0.34
#> [34] tidymass_1.0.9
#>
#> loaded via a namespace (and not attached):
#> [1] fs_1.6.4 matrixStats_1.3.0
#> [3] bitops_1.0-8 fit.models_0.64
#> [5] httr_1.4.7 RColorBrewer_1.1-3
#> [7] doParallel_1.0.17 tools_4.4.1
#> [9] doRNG_1.8.6 backports_1.5.0
#> [11] utf8_1.2.4 R6_2.5.1
#> [13] lazyeval_0.2.2 GetoptLong_1.0.5
#> [15] withr_3.0.1 prettyunits_1.2.0
#> [17] gridExtra_2.3 preprocessCore_1.66.0
#> [19] cli_3.6.3 Cairo_1.6-2
#> [21] fastDummies_1.7.4 labeling_0.4.3
#> [23] sass_0.4.9 mvtnorm_1.3-1
#> [25] robustbase_0.99-4 randomForest_4.7-1.1
#> [27] proxy_0.4-27 pbapply_1.7-2
#> [29] foreign_0.8-87 rrcov_1.7-6
#> [31] MetaboCoreUtils_1.12.0 parallelly_1.38.0
#> [33] itertools_0.1-3 limma_3.60.4
#> [35] readxl_1.4.3 rstudioapi_0.16.0
#> [37] impute_1.78.0 generics_0.1.3
#> [39] shape_1.4.6.1 vroom_1.6.5
#> [41] zip_2.3.1 Matrix_1.7-0
#> [43] MALDIquant_1.22.3 fansi_1.0.6
#> [45] abind_1.4-5 lifecycle_1.0.4
#> [47] yaml_2.3.10 SummarizedExperiment_1.34.0
#> [49] SparseArray_1.4.8 crayon_1.5.3
#> [51] PSMatch_1.8.0 KEGGREST_1.44.1
#> [53] magick_2.8.4 pillar_1.9.0
#> [55] knitr_1.48 GenomicRanges_1.56.1
#> [57] rjson_0.2.22 corpcor_1.6.10
#> [59] codetools_0.2-20 glue_1.7.0
#> [61] pcaMethods_1.96.0 data.table_1.16.0
#> [63] remotes_2.5.0 MultiAssayExperiment_1.30.3
#> [65] vctrs_0.6.5 png_0.1-8
#> [67] cellranger_1.1.0 gtable_0.3.5
#> [69] cachem_1.1.0 xfun_0.47
#> [71] openxlsx_4.2.7 S4Arrays_1.4.1
#> [73] tidygraph_1.3.1 pcaPP_2.0-5
#> [75] ncdf4_1.23 iterators_1.0.14
#> [77] statmod_1.5.0 bit64_4.0.5
#> [79] robust_0.7-5 progress_1.2.3
#> [81] GenomeInfoDb_1.40.1 rprojroot_2.0.4
#> [83] bslib_0.8.0 affyio_1.74.0
#> [85] rpart_4.1.23 colorspace_2.1-1
#> [87] DBI_1.2.3 Hmisc_5.1-3
#> [89] nnet_7.3-19 tidyselect_1.2.1
#> [91] bit_4.0.5 compiler_4.4.1
#> [93] MassSpecWavelet_1.70.0 htmlTable_2.4.3
#> [95] DelayedArray_0.30.1 plotly_4.10.4
#> [97] bookdown_0.40 checkmate_2.3.2
#> [99] scales_1.3.0 DEoptimR_1.1-3
#> [101] affy_1.82.0 digest_0.6.37
#> [103] rmarkdown_2.28 XVector_0.44.0
#> [105] htmltools_0.5.8.1 pkgconfig_2.0.3
#> [107] base64enc_0.1-3 MatrixGenerics_1.16.0
#> [109] highr_0.11 fastmap_1.2.0
#> [111] rlang_1.1.4 GlobalOptions_0.1.2
#> [113] htmlwidgets_1.6.4 UCSC.utils_1.0.0
#> [115] farver_2.1.2 jquerylib_0.1.4
#> [117] jsonlite_1.8.8 MsExperiment_1.6.0
#> [119] mzID_1.42.0 RCurl_1.98-1.16
#> [121] Formula_1.2-5 GenomeInfoDbData_1.2.12
#> [123] patchwork_1.2.0 munsell_0.5.1
#> [125] viridis_0.6.5 MsCoreUtils_1.16.1
#> [127] vsn_3.72.0 furrr_0.3.1
#> [129] stringi_1.8.4 ggraph_2.2.1
#> [131] zlibbioc_1.50.0 plyr_1.8.9
#> [133] parallel_4.4.1 listenv_0.9.1
#> [135] ggrepel_0.9.5 Biostrings_2.72.1
#> [137] MsFeatures_1.12.0 graphlayouts_1.1.1
#> [139] hms_1.1.3 Spectra_1.14.1
#> [141] circlize_0.4.16 igraph_2.0.3
#> [143] QFeatures_1.14.2 rngtools_1.5.2
#> [145] reshape2_1.4.4 XML_3.99-0.17
#> [147] evaluate_0.24.0 blogdown_1.19
#> [149] BiocManager_1.30.25 tzdb_0.4.0
#> [151] foreach_1.5.2 missForest_1.5
#> [153] tweenr_2.0.3 polyclip_1.10-7
#> [155] future_1.34.0 clue_0.3-65
#> [157] ggforce_0.4.2 AnnotationFilter_1.28.0
#> [159] e1071_1.7-14 RSpectra_0.16-2
#> [161] ggcorrplot_0.1.4.1 viridisLite_0.4.2
#> [163] class_7.3-22 rARPACK_0.11-0
#> [165] memoise_2.0.1 ellipse_0.5.0
#> [167] IRanges_2.38.1 cluster_2.1.6
#> [169] timechange_0.3.0 globals_0.16.3
#> [171] here_1.0.1