# -*- coding: utf-8 -*-
"""Provides VariantMapper and AssemblyMapper to project variants
between sequences using AlignmentMapper.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import logging
from bioutils.sequences import reverse_complement
import hgvs
import hgvs.location
import hgvs.normalizer
import hgvs.posedit
import hgvs.edit
import hgvs.alignmentmapper
import hgvs.utils.altseq_to_hgvsp as altseq_to_hgvsp
import hgvs.utils.altseqbuilder as altseqbuilder
import hgvs.sequencevariant
import hgvs.validator
from hgvs.exceptions import HGVSUnsupportedOperationError, HGVSInvalidVariantError
from hgvs.decorators.lru_cache import lru_cache
from hgvs.enums import PrevalidationLevel
from hgvs.utils.reftranscriptdata import RefTranscriptData
_logger = logging.getLogger(__name__)
[docs]class VariantMapper(object):
r"""Maps SequenceVariant objects between g., n., r., c., and p. representations.
g⟷{c,n,r} projections are similar in that c, n, and r variants
may use intronic coordinates. There are two essential differences
that distinguish the three types:
* Sequence start: In n and r variants, position 1 is the sequence
start; in c variants, 1 is the transcription start site.
* Alphabet: In n and c variants, sequences are DNA; in
r. variants, sequences are RNA.
This differences are summarized in this diagram::
g ----acgtatgcac--gtctagacgt---- ----acgtatgcac--gtctagacgt---- ----acgtatgcac--gtctagacgt----
\ \/ / \ \/ / \ \/ /
c acgtATGCACGTCTAGacgt n acgtatgcacgtctagacgt r acguaugcacgucuagacgu
1 1 1
p MetHisValTer
The g excerpt and exon structures are identical. The g⟷n
transformation, which is the most basic, accounts for the offset
of the aligned sequences (shown with "1") and the exon structure.
The g⟷c transformation is akin to g⟷n transformation, but
requires an addition offset to account for the translation start
site (c.1). The CDS in uppercase. The g⟷c transformation is
akin to g⟷n transformation with a change of alphabet.
Therefore, this this code uses g⟷n as the core transformation
between genomic and c, n, and r variants: All c⟷g and r⟷g
transformations use n⟷g after accounting for the above
differences. For example, c_to_g accounts for the transcription
start site offset, then calls n_to_g.
All methods require and return objects of type
:class:`hgvs.sequencevariant.SequenceVariant`.
"""
def __init__(self,
hdp,
replace_reference=hgvs.global_config.mapping.replace_reference,
prevalidation_level=hgvs.global_config.mapping.prevalidation_level,
add_gene_symbol=hgvs.global_config.mapping.add_gene_symbol
):
"""
:param bool replace_reference: replace reference (entails additional network access)
:param str prevalidation_level: None or Intrinsic or Extrinsic validation before mapping
"""
self.hdp = hdp
self.replace_reference = replace_reference
self.add_gene_symbol = add_gene_symbol
if prevalidation_level is None:
self.prevalidation_level = PrevalidationLevel.NONE
else:
self.prevalidation_level = PrevalidationLevel[prevalidation_level.upper()]
if self.prevalidation_level == PrevalidationLevel.NONE:
self._validator = None
elif self.prevalidation_level == PrevalidationLevel.INTRINSIC:
self._validator = hgvs.validator.IntrinsicValidator(strict=False)
else:
self._validator = hgvs.validator.Validator(self.hdp, strict=False)
# ############################################################################
# g⟷t
[docs] def g_to_t(self, var_g, tx_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
if not (var_g.type == "g"):
raise HGVSInvalidVariantError("Expected a g. variant; got " + str(var_g))
if self._validator:
self._validator.validate(var_g)
var_g.fill_ref(self.hdp)
mapper = self._fetch_AlignmentMapper(
tx_ac=tx_ac, alt_ac=var_g.ac, alt_aln_method=alt_aln_method)
if mapper.is_coding_transcript:
var_out = VariantMapper.g_to_c(
self, var_g=var_g, tx_ac=tx_ac, alt_aln_method=alt_aln_method)
else:
var_out = VariantMapper.g_to_n(
self, var_g=var_g, tx_ac=tx_ac, alt_aln_method=alt_aln_method)
return var_out
[docs] def t_to_g(self, var_t, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
if var_t.type not in "cn":
raise HGVSInvalidVariantError("Expected a c. or n. variant; got " + str(var_t))
if self._validator:
self._validator.validate(var_t)
var_t.fill_ref(self.hdp)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_t.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
if mapper.is_coding_transcript:
var_out = VariantMapper.c_to_g(
self, var_c=var_t, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
else:
var_out = VariantMapper.n_to_g(
self, var_n=var_t, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
return var_out
# ############################################################################
# g⟷n
[docs] def g_to_n(self, var_g, tx_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""Given a parsed g. variant, return a n. variant on the specified
transcript using the specified alignment method (default is
"splign" from NCBI).
:param hgvs.sequencevariant.SequenceVariant var_g: a variant object
:param str tx_ac: a transcript accession (e.g., NM_012345.6 or ENST012345678)
:param str alt_aln_method: the alignment method; valid values depend on data source
:returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`) using transcript (n.) coordinates
:raises HGVSInvalidVariantError: if var_g is not of type "g"
"""
if not (var_g.type == "g"):
raise HGVSInvalidVariantError("Expected a g. variant; got " + str(var_g))
if self._validator:
self._validator.validate(var_g)
var_g.fill_ref(self.hdp)
mapper = self._fetch_AlignmentMapper(
tx_ac=tx_ac, alt_ac=var_g.ac, alt_aln_method=alt_aln_method)
pos_n = mapper.g_to_n(var_g.posedit.pos)
if not pos_n.uncertain:
edit_n = self._convert_edit_check_strand(mapper.strand, var_g.posedit.edit)
if edit_n.type == 'ins' and pos_n.start.offset == 0 and pos_n.end.offset == 0 and pos_n.end - pos_n.start > 1:
pos_n.start.base += 1
pos_n.end.base -= 1
edit_n.ref = ''
else:
# variant at alignment gap
pos_g = mapper.n_to_g(pos_n)
edit_n = hgvs.edit.NARefAlt(
ref='', alt=self._get_altered_sequence(mapper.strand, pos_g, var_g))
pos_n.uncertain = var_g.posedit.pos.uncertain
var_n = hgvs.sequencevariant.SequenceVariant(
ac=tx_ac, type="n", posedit=hgvs.posedit.PosEdit(pos_n, edit_n))
if self.replace_reference:
self._replace_reference(var_n)
if self.add_gene_symbol:
self._update_gene_symbol(var_n, var_g.gene)
return var_n
[docs] def n_to_g(self, var_n, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""Given a parsed n. variant, return a g. variant on the specified
transcript using the specified alignment method (default is
"splign" from NCBI).
:param hgvs.sequencevariant.SequenceVariant var_n: a variant object
:param str alt_ac: a reference sequence accession (e.g., NC_000001.11)
:param str alt_aln_method: the alignment method; valid values depend on data source
:returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`)
:raises HGVSInvalidVariantError: if var_n is not of type "n"
"""
if not (var_n.type == "n"):
raise HGVSInvalidVariantError("Expected a n. variant; got " + str(var_n))
if self._validator:
self._validator.validate(var_n)
var_n.fill_ref(self.hdp)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_n.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
pos_g = mapper.n_to_g(var_n.posedit.pos)
if not pos_g.uncertain:
edit_g = self._convert_edit_check_strand(mapper.strand, var_n.posedit.edit)
if edit_g.type == 'ins' and pos_g.end - pos_g.start > 1:
pos_g.start.base += 1
pos_g.end.base -= 1
edit_g.ref = ''
else:
# variant at alignment gap
pos_n = mapper.g_to_n(pos_g)
edit_g = hgvs.edit.NARefAlt(
ref='', alt=self._get_altered_sequence(mapper.strand, pos_n, var_n))
pos_g.uncertain = var_n.posedit.pos.uncertain
var_g = hgvs.sequencevariant.SequenceVariant(
ac=alt_ac, type="g", posedit=hgvs.posedit.PosEdit(pos_g, edit_g))
if self.replace_reference:
self._replace_reference(var_g)
# No gene symbol for g. variants (actually, *should* for NG, but no way to distinguish)
return var_g
# ############################################################################
# g⟷c
[docs] def g_to_c(self, var_g, tx_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""Given a parsed g. variant, return a c. variant on the specified
transcript using the specified alignment method (default is
"splign" from NCBI).
:param hgvs.sequencevariant.SequenceVariant var_g: a variant object
:param str tx_ac: a transcript accession (e.g., NM_012345.6 or ENST012345678)
:param str alt_aln_method: the alignment method; valid values depend on data source
:returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`) using CDS coordinates
:raises HGVSInvalidVariantError: if var_g is not of type "g"
"""
if not (var_g.type == "g"):
raise HGVSInvalidVariantError("Expected a g. variant; got " + str(var_g))
if self._validator:
self._validator.validate(var_g)
var_g.fill_ref(self.hdp)
mapper = self._fetch_AlignmentMapper(
tx_ac=tx_ac, alt_ac=var_g.ac, alt_aln_method=alt_aln_method)
pos_c = mapper.g_to_c(var_g.posedit.pos)
if not pos_c.uncertain:
edit_c = self._convert_edit_check_strand(mapper.strand, var_g.posedit.edit)
if edit_c.type == 'ins' and pos_c.start.offset == 0 and pos_c.end.offset == 0 and pos_c.end - pos_c.start > 1:
pos_c.start.base += 1
pos_c.end.base -= 1
edit_c.ref = ''
else:
# variant at alignment gap
pos_g = mapper.c_to_g(pos_c)
edit_c = hgvs.edit.NARefAlt(
ref='', alt=self._get_altered_sequence(mapper.strand, pos_g, var_g))
pos_c.uncertain = var_g.posedit.pos.uncertain
var_c = hgvs.sequencevariant.SequenceVariant(
ac=tx_ac, type="c", posedit=hgvs.posedit.PosEdit(pos_c, edit_c))
if self.replace_reference:
self._replace_reference(var_c)
if self.add_gene_symbol:
self._update_gene_symbol(var_c, var_g.gene)
return var_c
[docs] def c_to_g(self, var_c, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""Given a parsed c. variant, return a g. variant on the specified
transcript using the specified alignment method (default is
"splign" from NCBI).
:param hgvs.sequencevariant.SequenceVariant var_c: a variant object
:param str alt_ac: a reference sequence accession (e.g., NC_000001.11)
:param str alt_aln_method: the alignment method; valid values depend on data source
:returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`)
:raises HGVSInvalidVariantError: if var_c is not of type "c"
"""
if not (var_c.type == "c"):
raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
var_c.fill_ref(self.hdp)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_c.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
pos_g = mapper.c_to_g(var_c.posedit.pos)
if not pos_g.uncertain:
edit_g = self._convert_edit_check_strand(mapper.strand, var_c.posedit.edit)
if edit_g.type == 'ins' and pos_g.end - pos_g.start > 1:
pos_g.start.base += 1
pos_g.end.base -= 1
edit_g.ref = ''
else:
# variant at alignment gap
var_n = copy.deepcopy(var_c)
var_n.posedit.pos = mapper.c_to_n(var_c.posedit.pos)
var_n.type = 'n'
pos_n = mapper.g_to_n(pos_g)
edit_g = hgvs.edit.NARefAlt(
ref='', alt=self._get_altered_sequence(mapper.strand, pos_n, var_n))
pos_g.uncertain = var_c.posedit.pos.uncertain
var_g = hgvs.sequencevariant.SequenceVariant(
ac=alt_ac, type="g", posedit=hgvs.posedit.PosEdit(pos_g, edit_g))
if self.replace_reference:
self._replace_reference(var_g)
return var_g
# ############################################################################
# c⟷n
[docs] def c_to_n(self, var_c):
"""Given a parsed c. variant, return a n. variant on the specified
transcript using the specified alignment method (default is
"transcript" indicating a self alignment).
:param hgvs.sequencevariant.SequenceVariant var_c: a variant object
:returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`)
:raises HGVSInvalidVariantError: if var_c is not of type "c"
"""
if not (var_c.type == "c"):
raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
var_c.fill_ref(self.hdp)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_c.ac, alt_ac=var_c.ac, alt_aln_method="transcript")
pos_n = mapper.c_to_n(var_c.posedit.pos)
if (isinstance(var_c.posedit.edit, hgvs.edit.NARefAlt)
or isinstance(var_c.posedit.edit, hgvs.edit.Dup)
or isinstance(var_c.posedit.edit, hgvs.edit.Inv)):
edit_n = copy.deepcopy(var_c.posedit.edit)
else:
raise HGVSUnsupportedOperationError(
"Only NARefAlt/Dup/Inv types are currently implemented")
var_n = hgvs.sequencevariant.SequenceVariant(
ac=var_c.ac, type="n", posedit=hgvs.posedit.PosEdit(pos_n, edit_n))
if self.replace_reference:
self._replace_reference(var_n)
if self.add_gene_symbol:
self._update_gene_symbol(var_n, var_c.gene)
return var_n
[docs] def n_to_c(self, var_n):
"""Given a parsed n. variant, return a c. variant on the specified
transcript using the specified alignment method (default is
"transcript" indicating a self alignment).
:param hgvs.sequencevariant.SequenceVariant var_n: a variant object
:returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`)
:raises HGVSInvalidVariantError: if var_n is not of type "n"
"""
if not (var_n.type == "n"):
raise HGVSInvalidVariantError("Expected n. variant; got " + str(var_n))
if self._validator:
self._validator.validate(var_n)
var_n.fill_ref(self.hdp)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_n.ac, alt_ac=var_n.ac, alt_aln_method="transcript")
pos_c = mapper.n_to_c(var_n.posedit.pos)
if (isinstance(var_n.posedit.edit, hgvs.edit.NARefAlt)
or isinstance(var_n.posedit.edit, hgvs.edit.Dup)
or isinstance(var_n.posedit.edit, hgvs.edit.Inv)):
edit_c = copy.deepcopy(var_n.posedit.edit)
else:
raise HGVSUnsupportedOperationError(
"Only NARefAlt/Dup/Inv types are currently implemented")
var_c = hgvs.sequencevariant.SequenceVariant(
ac=var_n.ac, type="c", posedit=hgvs.posedit.PosEdit(pos_c, edit_c))
if self.replace_reference:
self._replace_reference(var_c)
if self.add_gene_symbol:
self._update_gene_symbol(var_c, var_n.gene)
return var_c
# ############################################################################
# c ⟶ p
[docs] def c_to_p(self, var_c, pro_ac=None):
"""
Converts a c. SequenceVariant to a p. SequenceVariant on the specified protein accession
Author: Rudy Rico
:param SequenceVariant var_c: hgvsc tag
:param str pro_ac: protein accession
:rtype: hgvs.sequencevariant.SequenceVariant
"""
if not (var_c.type == "c"):
raise HGVSInvalidVariantError("Expected a cDNA (c.) variant; got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
reference_data = RefTranscriptData(self.hdp, var_c.ac, pro_ac)
builder = altseqbuilder.AltSeqBuilder(var_c, reference_data)
# TODO: handle case where you get 2+ alt sequences back;
# currently get list of 1 element loop structure implemented
# to handle this, but doesn't really do anything currently.
all_alt_data = builder.build_altseq()
var_ps = []
for alt_data in all_alt_data:
builder = altseq_to_hgvsp.AltSeqToHgvsp(reference_data, alt_data)
var_p = builder.build_hgvsp()
var_ps.append(var_p)
var_p = var_ps[0]
if self.add_gene_symbol:
self._update_gene_symbol(var_p, var_c.gene)
return var_p
############################################################################
# Internal methods
def _replace_reference(self, var):
"""fetch reference sequence for variant and update (in-place) if necessary"""
if var.type not in "cgmnr":
raise HGVSUnsupportedOperationError("Can only update references for type c, g, m, n, r")
if var.posedit.edit.type == "ins":
# insertions have no reference sequence (zero-width), so return as-is
return var
if var.posedit.edit.type == "con":
# conversions have no reference sequence (zero-width), so return as-is
return var
pos = var.posedit.pos
if ((isinstance(pos.start, hgvs.location.BaseOffsetPosition) and pos.start.offset != 0)
or (isinstance(pos.end, hgvs.location.BaseOffsetPosition) and pos.end.offset != 0)):
_logger.info("Can't update reference sequence for intronic variant {}".format(var))
return var
# For c. variants, we need coords on underlying sequences
if var.type == "c":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=var.ac, alt_aln_method="transcript")
pos = mapper.c_to_n(var.posedit.pos)
else:
pos = var.posedit.pos
seq = self.hdp.get_seq(var.ac, pos.start.base - 1, pos.end.base)
edit = var.posedit.edit
if edit.ref != seq:
_logger.debug("Replaced reference sequence in {var} with {seq}".format(
var=var, seq=seq))
edit.ref = seq
return var
@lru_cache(maxsize=hgvs.global_config.lru_cache.maxsize)
def _fetch_AlignmentMapper(self, tx_ac, alt_ac, alt_aln_method):
"""
Get a new AlignmentMapper for the given transcript accession (ac),
possibly caching the result.
"""
return hgvs.alignmentmapper.AlignmentMapper(
self.hdp, tx_ac=tx_ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
@staticmethod
def _convert_edit_check_strand(strand, edit_in):
"""
Convert an edit from one type to another, based on the stand and type
"""
if isinstance(edit_in, hgvs.edit.NARefAlt):
if strand == 1:
edit_out = copy.deepcopy(edit_in)
else:
try:
# if smells like an int, do nothing
# TODO: should use ref_n, right?
int(edit_in.ref)
ref = edit_in.ref
except (ValueError, TypeError):
ref = reverse_complement(edit_in.ref)
edit_out = hgvs.edit.NARefAlt(
ref=ref,
alt=reverse_complement(edit_in.alt),
)
elif isinstance(edit_in, hgvs.edit.Dup):
if strand == 1:
edit_out = copy.deepcopy(edit_in)
else:
edit_out = hgvs.edit.Dup(ref=reverse_complement(edit_in.ref))
elif isinstance(edit_in, hgvs.edit.Inv):
if strand == 1:
edit_out = copy.deepcopy(edit_in)
else:
try:
int(edit_in.ref)
ref = edit_in.ref
except (ValueError, TypeError):
ref = reverse_complement(edit_in.ref)
edit_out = hgvs.edit.Inv(ref=ref)
else:
raise NotImplementedError("Only NARefAlt/Dup/Inv types are currently implemented")
return edit_out
def _get_altered_sequence(self, strand, interval, var):
seq = list(self.hdp.get_seq(var.ac, interval.start.base - 1, interval.end.base))
# positions are 0-based and half-open
pos_start = var.posedit.pos.start.base - interval.start.base
pos_end = var.posedit.pos.end.base - interval.start.base + 1
edit = var.posedit.edit
if edit.type == 'sub':
seq[pos_start] = edit.alt
elif edit.type == 'del':
del seq[pos_start:pos_end]
elif edit.type == 'ins':
seq.insert(pos_start + 1, edit.alt)
elif edit.type == 'delins':
del seq[pos_start:pos_end]
seq.insert(pos_start, edit.alt)
elif edit.type == 'dup':
seq.insert(pos_end, ''.join(seq[pos_start:pos_end]))
elif edit.type == 'inv':
seq[pos_start:pos_end] = list(reverse_complement(''.join(seq[pos_start:pos_end])))
elif edit.type == 'identity':
pass
else:
raise HGVSUnsupportedOperationError(
"Getting altered sequence for {type} is unsupported".format(type=edit.type))
seq = ''.join(seq)
if strand == -1:
seq = reverse_complement(seq)
return seq
def _update_gene_symbol(self, var, symbol):
if not symbol:
symbol = self.hdp.get_tx_identity_info(var.ac).get("hgnc", None)
var.gene = symbol
return var
# <LICENSE>
# Copyright 2018 HGVS Contributors (https://github.com/biocommons/hgvs)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </LICENSE>