# Source code for MyCapytain.common.reference

# -*- coding: utf-8 -*-
"""
.. module:: MyCapytain.common.reference
   :synopsis: URN related objects

.. moduleauthor:: Thibault Clérice <leponteineptique@gmail.com>

>>> from MyCapytain.common.reference import URN, Reference, Citation

"""
from __future__ import unicode_literals

from collections import defaultdict
from past.builtins import basestring
from six import text_type as str
from builtins import \
    range, object
from copy import copy
import re


# Pre-compiled patterns shared by Reference and Citation.
# They are written as raw strings so that every backslash reaches the regex
# engine untouched: the previous plain literals relied on invalid string
# escapes ("\*", "\s", "\[", ...), which newer Python versions warn about.
# The resulting pattern text is identical to the former one.

# Splits a refsDecl XPath into its successive "/step" or "//step" chunks.
REFSDECL_SPLITTER = re.compile(r"""/+[\*()|\sa-zA-Z0-9:\[\]@=\\{\$'"\.\s]+""")
# Matches the numbered placeholders ("$1", "$2", ...) of a refsDecl.
REFSDECL_REPLACER = re.compile(r"\$[0-9]+")
# Captures the word and the optional "[n]" counter of a subreference.
SUBREFERENCE = re.compile(r"(\w*)\[{0,1}([0-9]*)\]{0,1}", re.UNICODE)
# Matches an attribute test such as @n='$1' or @n="?":
# group 1 is the attribute, group 2 the "=" sign, group 3 the quoted value.
REFERENCE_REPLACER = re.compile(r"""(@[a-zA-Z0-9:]+){1}(=){1}([\\$'"?0-9]{3,6})""")


class Reference(object):
    """ A reference object giving information about a passage identifier

    :param reference: Passage Reference part of a Urn
    :type reference: basestring

    :ivar parent: Parent Reference
    :type parent: Reference
    :ivar highest: Member of the range which is the highest in the hierarchy
        (if both members are equally deep, start is returned)
    :type highest: Reference
    :ivar start: First part of the range
    :type start: Reference
    :ivar end: Second part of the range
    :type end: Reference
    :ivar list: List representation of the reference. Not available for range
    :type list: list
    :ivar subreference: Word and word counter ("Achiles", 1) representing the
        subreference. Not available for range, None when there is no subreference
    :type subreference: (str, int)

    :Example:
        >>> a = Reference(reference="1.1@Achiles[1]-1.2@Zeus[1]")
        >>> b = Reference(reference="1.1")
        >>> str(Reference("1.1-2.2.2").highest) == "1.1"

    Reference object supports the following magic methods : len(), str() and eq().

    :Example:
        >>> len(a) == 2 and len(b) == 2
        >>> str(a) == "1.1@Achiles[1]-1.2@Zeus[1]"
        >>> b == Reference("1.1") and not (b == a)

    .. note::
        While Reference(...).subreference and .list are not available for range,
        Reference(..).start.subreference and Reference(..).end.subreference as well
        as .list are available
    """

    def __init__(self, reference=""):
        self.reference = reference
        if reference == "":
            # Empty reference: both members keep the blank model
            self.parsed = (self.__model(), self.__model())
        else:
            self.parsed = self.__parse(reference)

    @property
    def parent(self):
        """ Parent of the actual URN, for example, 1.1 for 1.1.1

        :rtype: Reference
        :returns: Parent reference, or None when the start member has at most
            one level or when only the end member has a single level
        """
        start_ids, end_ids = self.parsed[0][1], self.parsed[1][1]
        start_sub, end_sub = self.parsed[0][3], self.parsed[1][3]

        # A start member with no more than one level has no parent
        if len(start_ids) <= 1:
            return None

        start_parent = list(start_ids)[0:-1]

        if len(end_ids) == 0:
            # Single passage: drop the last level, keep the subreference
            return Reference("{0}{1}".format(
                ".".join(start_parent),
                start_sub or ""
            ))

        if len(end_ids) > 1:
            end_parent = list(end_ids)[0:-1]
            if start_parent == end_parent and end_sub is None and start_sub is None:
                # Both members share one parent: the range collapses to it
                return Reference(".".join(start_parent))
            return Reference("{0}{1}-{2}{3}".format(
                ".".join(start_parent),
                start_sub or "",
                ".".join(end_parent),
                end_sub or ""
            ))

        # End member has a single level: no parent range can be built
        return None

    @property
    def highest(self):
        """ Return highest reference level

        For references such as 1.1-1.2.8, with different level, it can be useful
        to access to the highest node in the hierarchy. In this case, the highest
        level would be 1.1. The function would return Reference("1.1")

        .. note:: By default, this property returns the start level

        :rtype: Reference
        """
        if not self.end:
            return self
        elif len(self.start) < len(self.end) and len(self.start):
            return self.start
        elif len(self.start) > len(self.end) and len(self.end):
            return self.end
        elif len(self.start):
            return self.start
        return self

    @property
    def start(self):
        """ Quick access property for the first member of the reference

        :rtype: Reference
        :returns: Start member, or None when it is empty
        """
        if self.parsed[0][0]:
            return Reference(self.parsed[0][0])

    @property
    def end(self):
        """ Quick access property for the second member of the reference

        :rtype: Reference
        :returns: End member, or None when the reference is not a range
        """
        if self.parsed[1][0]:
            return Reference(self.parsed[1][0])

    @property
    def list(self):
        """ Return a list version of the object if it is a single passage

        .. note:: Access to start list and end list should be done through
            obj.start.list and obj.end.list

        :rtype: [str]
        """
        if not self.end:
            return self.parsed[0][1]

    @property
    def subreference(self):
        """ Return the subreference of a single node reference

        .. note:: Access to start and end subreference should be done through
            obj.start.subreference and obj.end.subreference

        :rtype: (str, int)
        :returns: Word and counter, or None for a range or when the reference
            carries no "@" subreference
        """
        if not self.end:
            components = self.parsed[0][2]
            # The third slot stays None when the reference has no "@" part;
            # unpacking it would raise a TypeError.
            if components is None:
                return None
            return Reference.convert_subreference(*components)

    def __len__(self):
        """ Return depth of highest reference level

        For references such as 1.1-1.2.8, or simple references such as 1.a, with
        different level, it can be useful to know the depth of the reference to
        access the right XPath for example. This property returns the depth of
        the highest node

        :example:
            - len(1.1) == 2
            - len(1.2.8-1.3) == 2
            - len(1-1.2) == 1

        :rtype: int
        """
        return len(self.highest.list)

    def __str__(self):
        """ Return full reference in string format

        :rtype: basestring
        :returns: String representation of Reference Object

        :Example:
            >>> a = Reference(reference="1.1@Achiles[1]-1.2@Zeus[1]")
            >>> b = Reference(reference="1.1")
            >>> str(a) == "1.1@Achiles[1]-1.2@Zeus[1]"
            >>> str(b) == "1.1"
        """
        return self.reference

    def __eq__(self, other):
        """ Equality checker for Reference object

        :param other: An object to be checked against
        :rtype: boolean
        :returns: Equality between other and self

        :Example:
            >>> a = Reference(reference="1.1@Achiles[1]-1.2@Zeus[1]")
            >>> b = Reference(reference="1.1")
            >>> c = Reference(reference="1.1")
            >>> (a == b) == False
            >>> (c == b) == True
        """
        return (isinstance(other, self.__class__)
                and self.reference == str(other))

    def __model(self):
        """ Four-slot model for one member of a reference

        Slots are, in order: the full text of the member, the list of passage
        identifiers, the parsed subreference components and the raw
        subreference string (with its leading "@")

        :returns: A blank list to model data
        :rtype: list
        """
        return [None, [], None, None]

    def __regexp(self, subreference):
        """ Split components of subreference

        :param subreference: A subreference
        :type subreference: basestring
        :rtype: tuple
        :returns: First match of the SUBREFERENCE pattern: (word, counter)
        """
        return SUBREFERENCE.findall(subreference)[0]

    def __parse(self, reference):
        """ Parse a reference string into its (start, end) members

        :param reference: Reference to parse
        :type reference: basestring
        :rtype: tuple
        :returns: Two-element tuple; each parsed member is a tuple following
            the model, an absent end member stays a blank model list
        """
        members = [self.__model(), self.__model()]
        for index, text in enumerate(reference.split("-")):
            member = members[index]
            member[0] = text
            pieces = text.split("@")
            if len(pieces) == 2:
                member[2] = self.__regexp(pieces[1])
                member[3] = "@" + pieces[1]
                text = pieces[0]
            member[1] = text.split(".")
            members[index] = tuple(member)
        return tuple(members)

    @staticmethod
    def convert_subreference(word, counter):
        """ Convert the raw components of a subreference

        :param word: Word part of the subreference
        :type word: basestring
        :param counter: Counter part of the subreference, possibly empty
        :type counter: basestring
        :rtype: (str, int)
        :returns: Word and its counter; the counter is 0 when absent and the
            pair is ("", 0) when there is no word
        """
        if len(counter) and word:
            return str(word), int(counter)
        elif len(counter) == 0 and word:
            return str(word), 0
        else:
            return "", 0
class URN(object):
    """ A URN object giving all useful sections

    :param urn: A CTS URN
    :type urn: str

    :ivar urn_namespace: Namespace of the URN
    :type urn_namespace: str
    :ivar namespace: CTS Namespace
    :type namespace: str
    :ivar textgroup: CTS Textgroup
    :type textgroup: str
    :ivar work: CTS Work
    :type work: str
    :ivar version: CTS Version
    :type version: str
    :ivar reference: CTS Reference
    :type reference: Reference

    :cvar NAMESPACE: Constant representing the URN until its namespace
    :cvar TEXTGROUP: Constant representing the URN until its textgroup
    :cvar WORK: Constant representing the URN until its work
    :cvar VERSION: Constant representing the URN until its version
    :cvar PASSAGE: Constant representing the URN until its full passage
    :cvar PASSAGE_START: Constant representing the URN until its passage (end excluded)
    :cvar PASSAGE_END: Constant representing the URN until its passage (start excluded)
    :cvar NO_PASSAGE: Constant representing the URN until its passage excluding its passage
    :cvar COMPLETE: Constant representing the complete URN

    :Example:
        >>> a = URN(urn="urn:cts:latinLit:phi1294.phi002.perseus-lat2:1.1")

    URN object supports the following magic methods : len(), str() and eq(), gt() and lt().

    :Example:
        >>> b = URN("urn:cts:latinLit:phi1294.phi002")
        >>> a != b
        >>> a > b  # It has more member. Only member count is compared
        >>> b < a
        >>> len(a) == 5  # Reference is not counted to not induce count equivalencies with the optional version
        >>> len(b) == 4

    .. exclude-members:: all
    .. automethod:: upTo
    """

    NAMESPACE = 0
    TEXTGROUP = 1
    WORK = 2
    VERSION = 3
    PASSAGE = 4
    PASSAGE_START = 5
    PASSAGE_END = 6
    NO_PASSAGE = 10
    COMPLETE = 100

    def __init__(self, urn):
        # Cached string form; every setter resets it to None
        self.__urn = None
        self.__parsed = self.__parse(urn)

    @property
    def urn_namespace(self):
        return self.__parsed["urn_namespace"]

    @urn_namespace.setter
    def urn_namespace(self, value):
        self.__urn = None
        self.__parsed["urn_namespace"] = value

    @property
    def namespace(self):
        return self.__parsed["cts_namespace"]

    @namespace.setter
    def namespace(self, value):
        self.__urn = None
        self.__parsed["cts_namespace"] = value

    @property
    def textgroup(self):
        return self.__parsed["textgroup"]

    @textgroup.setter
    def textgroup(self, value):
        self.__urn = None
        self.__parsed["textgroup"] = value

    @property
    def work(self):
        return self.__parsed["work"]

    @work.setter
    def work(self, value):
        self.__urn = None
        self.__parsed["work"] = value

    @property
    def version(self):
        return self.__parsed["version"]

    @version.setter
    def version(self, value):
        self.__urn = None
        self.__parsed["version"] = value

    @property
    def reference(self):
        return self.__parsed["reference"]

    @reference.setter
    def reference(self, value):
        self.__urn = None
        # Anything which is not already a Reference gets wrapped into one
        if not isinstance(value, Reference):
            value = Reference(value)
        self.__parsed["reference"] = value

    def __len__(self):
        """ Returns the len of the URN

        :rtype: int
        :returns: Number of filled members of the URN

        .. warning:: Does not take into account the passage !

        :Example:
            >>> a = URN(urn="urn:cts:latinLit:phi1294.phi002.perseus-lat2:1.1")
            >>> print(len(a))
        """
        return sum(
            1
            for member, content in self.__parsed.items()
            if member != "reference" and content is not None
        )

    def __gt__(self, other):
        """ Allows for greater comparison

        :param other: Comparison object
        :type other: URN
        :rtype: boolean
        :returns: Indicator of bigger size

        .. warning:: Does not take into account the passage !

        :Example:
            >>> a = URN(urn="urn:cts:latinLit:phi1294.phi002.perseus-lat2:1.1")
            >>> b = URN(urn="urn:cts:latinLit:phi1294.phi002:1.1")
            >>> (a > b) == True
        """
        return len(self) > len(other)

    def __lt__(self, other):
        """ Allows for lower comparison

        :param other: Comparison object
        :type other: URN
        :rtype: boolean
        :returns: Indicator of lower size

        .. warning:: Does not take into account the passage !

        :Example:
            >>> a = URN(urn="urn:cts:latinLit:phi1294.phi002.perseus-lat2:1.1")
            >>> b = URN(urn="urn:cts:latinLit:phi1294.phi002:1.1")
            >>> (b < a) == True
        """
        return len(self) < len(other)

    def __eq__(self, other):
        """ Equality checker for URN object

        :param other: An object to be checked against
        :type other: URN
        :rtype: boolean
        :returns: Equality between other and self

        :Example:
            >>> a = URN(urn="urn:cts:latinLit:phi1294.phi002.perseus-lat2:1.1")
            >>> b = URN(urn="urn:cts:latinLit:phi1294.phi002:1.1")
            >>> (b == a) == False
        """
        if not isinstance(other, self.__class__):
            return False
        return self.__str__() == str(other)

    def __str__(self):
        """ Return the full urn, rebuilt from its members when one was changed

        :rtype: basestring
        :returns: String representation of URN Object

        :Example:
            >>> a = URN(urn="urn:cts:latinLit:phi1294.phi002.perseus-lat2:1.1")
            >>> str(a) == "urn:cts:latinLit:phi1294.phi002.perseus-lat2:1.1"
        """
        if self.__urn is not None:
            return self.__urn

        pieces = ["urn:" + self.__parsed["urn_namespace"]]
        if self.namespace:
            pieces.append(":" + self.namespace)
        if self.textgroup:
            pieces.append(":" + self.textgroup)
        if self.work:
            pieces.append("." + self.work)
        if self.version:
            pieces.append("." + self.version)
        if self.reference:
            pieces.append(":" + str(self.reference))

        self.__urn = "".join(pieces)
        return self.__urn

    def upTo(self, key):
        """ Returns the urn up to given level using URN Constants

        :param key: Identifier of the wished resource using URN constants
        :type key: int
        :returns: String representation of the partial URN requested
        :rtype: str
        :raises KeyError: When the key is unknown or the requested level is not
            available in this URN

        :Example:
            >>> a = URN(urn="urn:cts:latinLit:phi1294.phi002.perseus-lat2:1.1")
            >>> a.upTo(URN.TEXTGROUP) == "urn:cts:latinLit:phi1294"
        """
        parsed = self.__parsed
        middle = [
            part
            for part in (parsed["textgroup"], parsed["work"], parsed["version"])
            if part is not None
        ]

        def assemble(*tail):
            # Joined lazily so that nothing is built for unsupported keys
            head = ["urn", parsed["urn_namespace"], parsed["cts_namespace"]]
            return ":".join(head + list(tail))

        if key == URN.COMPLETE:
            return self.__str__()
        if key == URN.NAMESPACE:
            return assemble()
        if key == URN.TEXTGROUP and parsed["textgroup"]:
            return assemble(parsed["textgroup"])
        if key == URN.WORK and parsed["work"]:
            return assemble(".".join([parsed["textgroup"], parsed["work"]]))
        if key == URN.VERSION and parsed["version"]:
            return assemble(".".join(middle))
        if key == URN.NO_PASSAGE and parsed["work"]:
            return assemble(".".join(middle))
        if key == URN.PASSAGE and parsed["reference"]:
            return assemble(".".join(middle), str(self.reference))
        if key == URN.PASSAGE_START and parsed["reference"]:
            return assemble(".".join(middle), str(self.reference.start))
        if key == URN.PASSAGE_END and parsed["reference"] \
                and self.reference.end is not None:
            return assemble(".".join(middle), str(self.reference.end))

        raise KeyError("Provided key is not recognized.")

    @staticmethod
    def model():
        """ Blank dictionary holding every member of a URN

        :rtype: dict
        :returns: Dictionary whose values are all None
        """
        return {
            "urn_namespace": None,
            "cts_namespace": None,
            "textgroup": None,
            "work": None,
            "version": None,
            "reference": None
        }

    def __parse(self, urn):
        """ Parse a URN

        :param urn: A URN:CTS
        :type urn: basestring
        :rtype: dict
        :returns: Dictionary representation
        :raises ValueError: When the URN has less than three ":" separated members
        """
        fields = URN.model()
        # Anything after a "#" is left out of the URN
        self.__urn = urn.split("#")[0]
        segments = self.__urn.split(":")

        if not (isinstance(segments, list) and len(segments) > 2):
            raise ValueError("URN is empty")

        fields["urn_namespace"] = segments[1]
        fields["cts_namespace"] = segments[2]

        if len(segments) == 5:
            fields["reference"] = Reference(segments[4])

        if len(segments) >= 4:
            # zip stops after the third dotted component, extra ones are ignored
            labels = ("textgroup", "work", "version")
            for label, content in zip(labels, segments[3].split(".")):
                fields[label] = content

        return fields
class Citation(object):
    """ A citation object gives information about the citation scheme

    :param name: Name of the citation (e.g. "book")
    :type name: basestring
    :param xpath: Xpath of the citation (As described by CTS norm)
    :type xpath: basestring
    :param scope: Scope of the citation (As described by CTS norm)
    :type scope: basestring
    :param refsDecl: refsDecl version
    :type refsDecl: basestring
    :param child: A citation
    :type child: Citation

    :ivar name: Name of the citation (e.g. "book")
    :type name: basestring
    :ivar xpath: Xpath of the citation (As described by CTS norm)
    :type xpath: basestring
    :ivar scope: Scope of the citation (As described by CTS norm)
    :type scope: basestring
    :ivar refsDecl: refsDecl version
    :type refsDecl: basestring
    :ivar child: A citation
    :type child: Citation
    """

    def __init__(self, name=None, xpath=None, scope=None, refsDecl=None, child=None):
        """ Initialize a Citation object
        """
        self.__name = None
        self.__xpath = None
        self.__scope = None
        self.__refsDecl = None
        self.__child = None

        # Go through the setters so that xpath/scope and refsDecl stay in sync
        self.name = name
        self.scope = scope
        self.xpath = xpath
        self.refsDecl = refsDecl

        if child is not None:
            self.child = child

    @property
    def name(self):
        """ Type of the citation represented

        :type: basestring
        :Example: Book, Chapter, Textpart, Section, Poem...
        """
        return self.__name

    @name.setter
    def name(self, val):
        self.__name = val

    @property
    def xpath(self):
        """ TextInventory xpath property of a citation (ie. identifier of the last element of the citation)

        :type: basestring
        :Example: //tei:l[@n="?"]
        """
        return self.__xpath

    @xpath.setter
    def xpath(self, val):
        # None is ignored: the current value is kept
        if val is not None:
            self.__xpath = val
            self.__upRefsDecl()

    @property
    def scope(self):
        """ TextInventory scope property of a citation (ie. identifier of all element but the last of the citation)

        :type: basestring
        :Example: /tei:TEI/tei:text/tei:body/tei:div
        """
        return self.__scope

    @scope.setter
    def scope(self, val):
        # None is ignored: the current value is kept
        if val is not None:
            self.__scope = val
            self.__upRefsDecl()

    @property
    def refsDecl(self):
        """ RefsDecl expression of the citation scheme

        :type: basestring
        :Example: /tei:TEI/tei:text/tei:body/tei:div//tei:l[@n='$1']
        """
        return self.__refsDecl

    @refsDecl.setter
    def refsDecl(self, val):
        # None is ignored: the current value is kept
        if val is not None:
            self.__refsDecl = val
            self.__upXpathScope()

    @property
    def child(self):
        """ Child of a citation

        :type: Citation or None
        :Example: Citation.name==poem would have a child Citation.name==line
        """
        return self.__child

    @child.setter
    def child(self, val):
        # Values which are not Citation instances are silently ignored
        if isinstance(val, self.__class__):
            self.__child = val

    def __upXpathScope(self):
        """ Update xpath and scope property when refsDecl is updated
        """
        steps = REFSDECL_SPLITTER.findall(self.__refsDecl)
        # Every step but the last one forms the scope, the last one is the xpath
        self.__scope = REFSDECL_REPLACER.sub("?", "".join(steps[0:-1]))
        self.__xpath = REFSDECL_REPLACER.sub("?", steps[-1])

    def __upRefsDecl(self):
        """ Update refsDecl property when xpath or scope is updated

        Nothing happens until both scope and xpath are set. Each "?" of
        scope + xpath is replaced by "$1", "$2"... in order of appearance.
        """
        if self.__scope is None or self.__xpath is None:
            return
        pieces = (self.__scope + self.__xpath).split("?")
        self.__refsDecl = pieces[0] + "".join(
            "$" + str(position) + piece
            for position, piece in enumerate(pieces[1:], 1)
        )

    def __iter__(self):
        """ Iteration method

        Loop over the citation and its nested children

        :Example:
            >>> c = Citation(name="line")
            >>> b = Citation(name="poem", child=c)
            >>> a = Citation(name="book", child=b)
            >>> [e for e in a] == [a, b, c]
        """
        current = self
        while current is not None:
            yield current
            current = getattr(current, "child", None)

    def __len__(self):
        """ Length method

        :rtype: int
        :returns: Number of nested citations
        """
        return sum(1 for _ in self)

    def fill(self, passage=None, xpath=None):
        """ Fill the xpath with given information

        :param passage: Passage reference
        :type passage: Reference or list or None. Can be list of None and not None
        :param xpath: If set to True, will return the replaced self.xpath value and not the whole self.refsDecl
        :type xpath: Boolean
        :rtype: basestring
        :returns: Xpath to find the passage
        :raises TypeError: When xpath is True and passage is neither a string nor None

        .. code-block:: python

            citation = Citation(name="line", scope="/TEI/text/body/div/div[@n=\\"?\\"]", xpath="//l[@n=\\"?\\"]")
            print(citation.fill(["1", None]))
            # /TEI/text/body/div/div[@n='1']//l[@n]
            print(citation.fill(None))
            # /TEI/text/body/div/div[@n]//l[@n]
            print(citation.fill(Reference("1.1")))
            # /TEI/text/body/div/div[@n='1']//l[@n='1']
            print(citation.fill("1", xpath=True))
            # //l[@n='1']
        """
        if xpath is True:
            # Only the last level is filled: passage is a string or None
            if passage is None:
                replacement = r"\1"
            elif isinstance(passage, basestring):
                replacement = r"\1\2'" + passage + "'"
            else:
                # Formerly ended in an UnboundLocalError on "replacement"
                raise TypeError(
                    "When xpath is True, passage must be a string or None"
                )
            return REFERENCE_REPLACER.sub(replacement, self.xpath)

        if passage is None:
            # No constraint at all: keep the bare attribute tests
            return REFERENCE_REPLACER.sub(r"\1", self.refsDecl)

        if isinstance(passage, Reference):
            # For a range, the start member is used
            passage = passage.list or passage.start.list

        levels = iter(passage)
        return REFERENCE_REPLACER.sub(
            lambda match: REF_REPLACER(match, levels),
            self.refsDecl
        )

    def __getstate__(self):
        """ Pickling method

        :return: Shallow copy of the instance dictionary
        """
        return copy(self.__dict__)

    def __setstate__(self, dic):
        """ Unpickling method

        :param dic: Instance dictionary to restore
        :return: The instance itself
        """
        self.__dict__ = dic
        return self
def REF_REPLACER(match, passage):
    """ Helper to replace xpath/scope/refsDecl on iteration with passage value

    Each call consumes the next value of ``passage``. A value of None, or an
    exhausted iterator, yields the bare attribute (e.g. ``@n``) so that the
    corresponding level is left unconstrained.

    :param match: A RegExp match whose first group is the attribute (e.g. ``@n``)
    :type match: re.SRE_MATCH
    :param passage: An iterator over the identifiers of each level
    :type passage: iter
    :rtype: basestring
    :return: Replaced string
    """
    attribute = match.groups()[0]
    # A default is given to next() so that a passage shorter than the number
    # of placeholders does not leak a StopIteration out of the re.sub callback.
    ref = next(passage, None)
    if ref is None:
        return attribute
    return "{1}='{0}'".format(ref, attribute)