Source code for hgvs.parser

# -*- coding: utf-8 -*-
"""Provides parser for HGVS strings and HGVS-related conceptual
components, such as intronic-offset coordinates

"""

from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import copy
import re

from pkg_resources import resource_filename

import bioutils.sequences
import ometa.runtime
import parsley

from hgvs.exceptions import HGVSParseError

# The following imports are referenced by fully-qualified name in the
# hgvs grammar.
import hgvs.enums
import hgvs.edit
import hgvs.hgvsposition
import hgvs.location
import hgvs.posedit
import hgvs.sequencevariant


class Parser(object):
    """Provides comprehensive parsing of HGVS variant strings (*i.e.*,
    variants represented according to the Human Genome Variation
    Society recommendations) into Python representations.  The class
    wraps a Parsing Expression Grammar, exposing rules of that grammar
    as methods (prefixed with `parse_`) that parse an input string
    according to the rule.  By default the public rules are exposed
    (all rules when `expose_all_rules=True`), so that it's possible to
    parse both full variant representations as well as components,
    like so:

    >>> hp = Parser()
    >>> v = hp.parse_hgvs_variant("NM_01234.5:c.22+1A>T")
    >>> v
    SequenceVariant(ac=NM_01234.5, type=c, posedit=22+1A>T, gene=None)
    >>> v.posedit.pos
    BaseOffsetInterval(start=22+1, end=22+1, uncertain=False)
    >>> i = hp.parse_c_interval("22+1")
    >>> i
    BaseOffsetInterval(start=22+1, end=22+1, uncertain=False)

    The `parse_hgvs_variant` and `parse_c_interval` methods correspond
    to the `hgvs_variant` and `c_interval` rules in the grammar,
    respectively.

    As a convenience, the Parser provides the `parse` method as a
    shorthand for `parse_hgvs_variant`:

    >>> v = hp.parse("NM_01234.5:c.22+1A>T")
    >>> v
    SequenceVariant(ac=NM_01234.5, type=c, posedit=22+1A>T, gene=None)

    Because the methods are generated on-the-fly and depend on the
    grammar that is loaded at runtime, a full list of methods is not
    available in the documentation.  The generated methods are set as
    instance attributes whose names start with `parse_`, so they can
    be discovered at runtime by inspecting `dir(hp)`.

    A few notable methods are listed below:

    `parse_hgvs_variant()` parses any valid HGVS string supported by
    the grammar.

    >>> hp.parse_hgvs_variant("NM_01234.5:c.22+1A>T")
    SequenceVariant(ac=NM_01234.5, type=c, posedit=22+1A>T, gene=None)
    >>> hp.parse_hgvs_variant("NP_012345.6:p.Ala22Trp")
    SequenceVariant(ac=NP_012345.6, type=p, posedit=Ala22Trp, gene=None)

    The `hgvs_variant` rule iteratively attempts parsing using the
    major classes of HGVS variants. For slight improvements in
    efficiency, those rules may be invoked directly:

    >>> hp.parse_p_variant("NP_012345.6:p.Ala22Trp")
    SequenceVariant(ac=NP_012345.6, type=p, posedit=Ala22Trp, gene=None)

    Similarly, components of the underlying structure may be parsed
    directly as well:

    >>> hp.parse_c_posedit("22+1A>T")
    PosEdit(pos=22+1, edit=A>T, uncertain=False)
    >>> hp.parse_c_interval("22+1")
    BaseOffsetInterval(start=22+1, end=22+1, uncertain=False)

    """

    # Path of the grammar file bundled with this package; resolved once,
    # when the class body is executed.
    __default_grammar_fn = resource_filename(__name__, "_data/hgvs.pymeta")

    def __init__(self, grammar_fn=__default_grammar_fn, expose_all_rules=False):
        """Load the grammar and generate the `parse_*` methods.

        :param str grammar_fn: path of the grammar file to load
            (defaults to the grammar bundled with the package)
        :param bool expose_all_rules: if True, expose every grammar
            rule as a `parse_*` method rather than only the public ones
        """
        self._grammar_fn = grammar_fn
        # Use a context manager so the grammar file handle is closed as
        # soon as its contents have been read.
        with open(grammar_fn, "r") as grammar_fh:
            grammar_source = grammar_fh.read()
        # The bindings are the names the grammar refers to by
        # fully-qualified name.
        self._grammar = parsley.makeGrammar(grammar_source, {
            "hgvs": hgvs,
            "bioutils": bioutils,
            "copy": copy
        })
        self._logger = logging.getLogger(__name__)
        self._expose_rule_functions(expose_all_rules)

    def parse(self, v):
        """parse HGVS variant `v`, returning a SequenceVariant

        :param str v: an HGVS-formatted variant as a string
        :rtype: SequenceVariant

        """
        return self.parse_hgvs_variant(v)

    def _expose_rule_functions(self, expose_all_rules=False):
        """add parse functions for public grammar rules

        Defines a function for each public grammar rule, based on
        introspecting the grammar. For example, the `c_interval` rule
        is exposed as a method `parse_c_interval` and used like this::

          Parser.parse_c_interval('26+2_57-3') -> Interval(...)

        :param bool expose_all_rules: if True, skip the public-rule
            filter and expose every rule found in the grammar

        """

        def make_parse_rule_function(rule_name):
            "builds a wrapper function that parses a string with the specified rule"

            def rule_fxn(s):
                try:
                    return self._grammar(s).__getattr__(rule_name)()
                except ometa.runtime.ParseError as exc:
                    # Re-raise as the package's own exception type,
                    # reporting the input, the error position and the
                    # parser's formatted reason.
                    raise HGVSParseError("{s}: char {exc.position}: {reason}".format(
                        s=s, exc=exc, reason=exc.formatReason()))

            rule_fxn.__doc__ = "parse string s using `%s' rule" % rule_name
            return rule_fxn

        # Names of the rules considered public. Note that `re.match`
        # anchors only at the start of the name.
        exposed_rule_re = re.compile(r"hgvs_(variant|position)|(c|g|m|n|p|r)"
                                     r"_(edit|hgvs_position|interval|pos|posedit|variant)")

        # Rules appear on the grammar class as attributes named
        # `rule_<name>`.  Strip only the leading prefix, so that a rule
        # whose own name happens to contain "rule_" is left intact.
        rule_prefix = "rule_"
        exposed_rules = [
            attr[len(rule_prefix):]
            for attr in dir(self._grammar._grammarClass)
            if attr.startswith(rule_prefix)
        ]
        if not expose_all_rules:
            exposed_rules = [
                rule_name for rule_name in exposed_rules if exposed_rule_re.match(rule_name)
            ]

        for rule_name in exposed_rules:
            setattr(self, "parse_" + rule_name, make_parse_rule_function(rule_name))

        self._logger.debug("Exposed {n} rules ({rules})".format(
            n=len(exposed_rules), rules=", ".join(exposed_rules)))
# <LICENSE> # Copyright 2018 HGVS Contributors (https://github.com/biocommons/hgvs) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # </LICENSE>