% Journal article: Ibrahim et al. (2007), Microbiology 153(11), 3631--3644.
% Conventions used in this entry:
%   - doi holds the bare identifier; the resolver prefix is supplied by the style.
%   - keywords is ONE field (BibTeX keeps only one field per name); items are
%     separated by semicolons because each item itself contains a comma.
%   - all values are 7-bit ASCII with TeX escapes, so the entry is safe under
%     classic BibTeX as well as Biber.
@article{mbs:/content/journal/micro/10.1099/mic.0.2007/006205-0,
  author    = {Ibrahim, Mariam and Nicolas, Pierre and Bessi{\`e}res, Philippe and Bolotin, Alexander and Monnet, V{\'e}ronique and Gardan, Rozenn},
  title     = {A genome-wide survey of short coding sequences in streptococci},
  journal   = {Microbiology},
  year      = {2007},
  volume    = {153},
  number    = {11},
  pages     = {3631--3644},
  doi       = {10.1099/mic.0.2007/006205-0},
  url       = {https://www.microbiologyresearch.org/content/journal/micro/10.1099/mic.0.2007/006205-0},
  publisher = {Microbiology Society},
  issn      = {1465-2080},
  type      = {Journal Article},
  keywords  = {SHP, short hydrophobic peptide; LC-MS/MS, liquid chromatography--tandem MS; CT, threshold cycle; HMM, hidden Markov model; CDS, coding sequence; spCDS, short putative CDS; pHMM, probability of a `true positive' prediction after decoding with HMM},
  abstract  = {Identification of short genes that encode peptides of fewer than 60 aa is challenging, both experimentally and in silico. As a consequence, the universe of these short coding sequences (CDSs) remains largely unknown, although some are acknowledged to play important roles in cell--cell communication, particularly in Gram-positive bacteria. This paper reports a thorough search for short CDSs across streptococcal genomes. Our bioinformatic approach relied on a combination of advanced intrinsic and extrinsic methods. In the first step, intrinsic sequence information (nucleotide composition and presence of RBSs) served to identify new short putative CDSs (spCDSs) and to eliminate the differences between annotation policies. In the second step, pseudogene fragments and false predictions were filtered out. The last step consisted of screening the remaining spCDSs for lines of extrinsic evidence involving sequence and gene-context comparisons.
    A total of 789 spCDSs across 20 complete genomes (19 Streptococcus and one Enterococcus) received the support of at least one line of extrinsic evidence, which corresponds to an average of 20 short CDSs per million base pairs. Most of these had no known function, and a significant fraction (31\,\%) are not even annotated as hypothetical genes in GenBank records. As an illustration of the value of this list, we describe a new family of CDSs, encoding very short hydrophobic peptides (20--23 aa) situated just upstream of some of the positive transcriptional regulators of the Rgg family. The expression of seven other short CDSs from Streptococcus thermophilus CNRZ1066 that encode peptides ranging in length from 41 to 56 aa was confirmed by real-time quantitative RT-PCR and revealed a variety of expression patterns. Finally, one peptide from this list, encoded by a gene that is not annotated in GenBank, was identified in a cell-envelope-enriched fraction of S. thermophilus CNRZ1066.},
}