It is very important that the user enter chemical formula easy and quick to the mass spectrometry data system. Modern high resolution mass spectrometry has about 7 to 8 decimal place accuracy so that is difficult to enter correrct mass value by numerical value. On the software side, formula brings more flexibility for automated estimation of adducts and to be consistant for electron mass is taking in account.
It seems that there are several approach been exist on the network, so this is yet another one someting practical for mass spectrometry data system.
On the mass spectrometry data system, user enter chemical formula for
Major purpos of the formula parser is simply count number of atoms. It means convert a user friendly format formula into atomic composition list. However, #6 of above use case require pritty formated string according to what user entered. Assume user enter following text:
CH3(C3H4(NH2)2)18CH3 CH3(13C3H4(NH2)2)18CH3 13CH3(C3H4(NH2)2)18CH3 CH3(C3H4(NH2)2)18 13CH3
Expected output for pritty print are:
CH3(C3H4(NH2)2)18CH3
CH3(13C3H4(NH2)2)18CH3
13CH3(C3H4(NH2)2)18CH3
CH3(C3H4(NH2)2)18 13CH3
On the mass spectrometry application, we need to determine isotopes such as 12C and 13C. An example, protein identification experiments uses H218O (heavy oxigen water) for hydrolysis reaction in order to find a quantity from the ratio of -16O and -18O associated peptides.
Full source code is implemented in qtplatz/src/libs/adportable/formula_parser.hpp, that is equivalent to the code at the bottom of the page.
Actual implementation on QtPlatz support both wchar_t and char as input, and rich text (HTML) output for pritty formated formula output. Mass value calculation is separately implemented as ChemicalFormula class object that contains table of element.
namespace adportable {
namespace chem {
namespace qi = boost::spirit::qi;
using boost::phoenix::bind;
using boost::spirit::ascii::space;
const char * element_table [] = {
"H", "He",
"Li", "Be", "B", "C", "N", "O", "F", "Ne",
"Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",
"K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr",
"Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe",
"Cs", "Ba", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn",
"Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr",
// Lanthanoids
"La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er" "Tm", "Yb",
// Actinoids
"Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No"
};
typedef std::pair< int, const char * > atom_type;
typedef std::map< atom_type, int > comp_type;
template<typename Iterator
, typename handler = formulaComposition
, typename startType = comp_type>
struct chemical_formula_parser : qi::grammar< Iterator, startType() > {
chemical_formula_parser() : chemical_formula_parser::base_type( molecule )
, element( element_table, element_table ) {
molecule =
+ (
atoms [ phoenix::bind(&handler::formula_add, _val, qi::_1) ]
| repeated_group [ phoenix::bind(&handler::formula_join, _val, qi::_1 ) ]
| space
)
;
atoms =
atom >> ( qi::uint_ | qi::attr(1u) ) // default to 1
;
atom =
( qi::uint_ | qi::attr(0u) ) >> element
;
repeated_group %=
'(' >> molecule >> ')'
>> qi::omit[ qi::uint_[ phoenix::bind( handler::formula_repeat
, qi::_val
, qi::_1 ) ] ]
| '[' >> molecule >> ']'
>> qi::omit[ qi::uint_[ phoenix::bind( handler::formula_repeat
, qi::_val
, qi::_1 ) ] ]
;
}
qi::rule<Iterator, atom_type() > atom;
qi::rule<Iterator, std::pair< atom_type, int >() > atoms;
qi::rule<Iterator, startType()> molecule, repeated_group;
qi::symbols<char, const char *> element;
};
struct formulaComposition {
static void formula_add( comp_type& m, const std::pair<const atom_type, int>& p ) {
m[ p.first ] += p.second;
}
static void formula_join( comp_type& m, comp_type& a ) {
std::for_each( a.begin(), a.end(), [&]( comp_type::value_type& p ){
m[ p.first ] += p.second;
});
}
static void formula_repeat( comp_type& m, int n ) {
std::for_each( m.begin(), m.end(), [=]( comp_type::value_type& p ){
p.second *= n;
});
}
};
}
}
namespace test {
// for chemical formula formatter
static const char * braces [] = { "(", ")" };
using adportable::chem::atom_type;
typedef std::vector< std::pair< atom_type, size_t > > format_type;
struct formulaFormat {
static void formula_add( format_type& m
, const std::pair<const atom_type, std::size_t>& p ) {
m.push_back( p );
}
static void formula_join( format_type& m, format_type& a ) {
m.push_back( std::make_pair( atom_type( 0, braces[0] ), 0 ) );
for ( auto t: a )
m.push_back( t );
}
static void formula_repeat( format_type& m, std::size_t n ) {
m.push_back( std::make_pair( atom_type( 0, braces[1] ), n ) );
}
};
}
int
main(int argc, char * argv[])
{
(void)argc;
(void)argv;
std::string str;
std::wstring wstr;
namespace chem = adportable::chem;
namespace qi = boost::spirit::qi;
typedef
chem::chemical_formula_parser < std::string::const_iterator
, chem::formulaComposition
, chem::comp_type> composition_calculator_t;
typedef
chem::chemical_formula_parser < std::wstring::const_iterator
, test::formulaFormat
, test::format_type > formula_formatter_t;
while (std::getline(std::cin, str)) {
if (str.empty() || str[0] == 'q' || str[0] == 'Q')
break;
// compute compsition (ascii input)
do {
chem::comp_type comp;
auto it = str.begin();
auto end = str.end();
if ( qi::parse( it, end, composition_calculator_t(), comp ) && it == end ) {
std::cout << "-------------------------\n";
std::cout << "Parsing succeeded\n";
std::cout << str << " Parses OK: " << std::endl;
std::cout << str << " map size: " << comp.size() << std::endl;
for ( auto e: comp )
std::cout << e.first.first << " "
<< e.first.second << " "
<< e.second << std::endl;
std::cout << "-------------------------\n";
} else {
std::cout << "-------------------------\n";
std::cout << "Parsing failed\n";
std::cout << "-------------------------\n";
}
} while(0);
// pritty formatted text (wchar_t input)
do {
wstr.resize( str.size() );
std::copy( str.begin(), str.end(), wstr.begin() );
std::wcout << "as wide: " << wstr << std::endl;
test::format_type fmt;
auto it = wstr.begin();
auto end = wstr.end();
if ( qi::parse( it, end, formula_formatter_t(), fmt ) && it == end ) {
std::cout << "-------------------------\n";
std::cout << str << " Parses OK: " << std::endl;
for ( auto e: fmt ) {
if ( std::strcmp( e.first.second, "(" ) == 0 )
std::cout << "(";
else if ( std::strcmp( e.first.second, ")" ) == 0 )
std::cout << ")" << e.second;
else {
if ( e.first.first )
std::cout << "<sup>" << e.first.first << "</sup>";
std::cout << e.first.second;
if ( e.second > 1 )
std::cout << "<sub>" << e.second << "</sub>";
}
}
std::cout << "\n-------------------------\n";
} else {
std::cout << "-------------------------\n";
std::cout << "Parsing failed for\n";
std::cout << str;
std::cout << "-------------------------\n";
}
} while(0);
}
std::cout << "Bye... :-) \n\n";
return 0;
}
formula_parser < formulae.txt
-------------------------
Parsing succeeded
CH3(C3H4(NH2)2)18CH3 Parses OK:
CH3(C3H4(NH2)2)18CH3 map size: 3
0 H 150
0 C 56
0 N 36
-------------------------
as wide: CH3(C3H4(NH2)2)18CH3
-------------------------
CH3(C3H4(NH2)2)18CH3 Parses OK:
CH<sub>3</sub>(C<sub>3</sub>H<sub>4</sub>(NH<sub>2</sub>)2)18CH<sub>3</sub>
-------------------------
-------------------------
Parsing succeeded
CH3(13C3H4(NH2)2)18CH3 Parses OK:
CH3(13C3H4(NH2)2)18CH3 map size: 4
0 H 150
0 C 2
0 N 36
13 C 54
-------------------------
as wide: CH3(13C3H4(NH2)2)18CH3
-------------------------
CH3(13C3H4(NH2)2)18CH3 Parses OK:
CH<sub>3</sub>(<sup>13</sup>C<sub>3</sub>H<sub>4</sub>(NH<sub>2</sub>)2)18CH<sub>3</sub>
-------------------------
-------------------------
Parsing succeeded
13CH3(C3H4(NH2)2)18CH3 Parses OK:
13CH3(C3H4(NH2)2)18CH3 map size: 4
0 H 150
0 C 55
0 N 36
13 C 1
-------------------------
as wide: 13CH3(C3H4(NH2)2)18CH3
-------------------------
13CH3(C3H4(NH2)2)18CH3 Parses OK:
<sup>13</sup>CH<sub>3</sub>(C<sub>3</sub>H<sub>4</sub>(NH<sub>2</sub>)2)18CH<sub>3</sub>
-------------------------
-------------------------
Parsing succeeded
CH3(C3H4(NH2)2)18 13CH3 Parses OK:
CH3(C3H4(NH2)2)18 13CH3 map size: 4
0 H 150
0 C 55
0 N 36
13 C 1
-------------------------
as wide: CH3(C3H4(NH2)2)18 13CH3
-------------------------
CH3(C3H4(NH2)2)18 13CH3 Parses OK:
CH<sub>3</sub>(C<sub>3</sub>H<sub>4</sub>(NH<sub>2</sub>)2)18<sup>13</sup>CH<sub>3</sub>
-------------------------
Bye... :-)