% Journal article record: NonClasGP-Pred, Microbial Genomics 6(12), 2020.
% The record carries no page range; the article number is stored in `eid`.
% The citation key is the publisher-export key and is kept unchanged for existing citations.
@article{mbs:/content/journal/mgen/10.1099/mgen.0.000483,
  author    = {Wang, Chao and Wu, Jin and Xu, Lei and Zou, Quan},
  title     = {{NonClasGP-Pred}: robust and efficient prediction of non-classically secreted proteins by integrating subset-specific optimal models of imbalanced data},
  journal   = {Microbial Genomics},
  year      = {2020},
  volume    = {6},
  number    = {12},
  eid       = {e000483},
  publisher = {Microbiology Society},
  issn      = {2057-5858},
  doi       = {10.1099/mgen.0.000483},
  url       = {https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000483},
  type      = {Journal Article},
  keywords  = {model ensemble, machine learning, imbalanced dataset, non-classically secreted proteins, feature selection},
  abstract  = {Non-classically secreted proteins (NCSPs) are proteins that are located in the extracellular environment, although there is a lack of known signal peptides or secretion motifs. They usually perform different biological functions in intracellular and extracellular environments, and several of their biological functions are linked to bacterial virulence and cell defence. Accurate protein localization is essential for all living organisms, however, the performance of existing methods developed for NCSP identification has been unsatisfactory and in particular suffer from data deficiency and possible overfitting problems. Further improvement is desirable, especially to address the lack of informative features and mining subset-specific features in imbalanced datasets. In the present study, a new computational predictor was developed for NCSP prediction of gram-positive bacteria. First, to address the possible prediction bias caused by the data imbalance problem, ten balanced subdatasets were generated for ensemble model construction. Then, the F-score algorithm combined with sequential forward search was used to strengthen the feature representation ability for each of the training subdatasets. Third, the subset-specific optimal feature combination process was adopted to characterize the original data from different aspects, and all subdataset-based models were integrated into a unified model, NonClasGP-Pred, which achieved an excellent performance with an accuracy of 93.23 \%, a sensitivity of 100 \%, a specificity of 89.01 \%, a Matthew's correlation coefficient of 87.68 \% and an area under the curve value of 0.9975 for ten-fold cross-validation. Based on assessment on the independent test dataset, the proposed model outperformed state-of-the-art available toolkits. For availability and implementation, see: http://lab.malab.cn/~wangchao/softwares/NonClasGP/.},
}