Add annotations

src-d · Mar 7, 2019 · 0e55837 · 0e55837
1 parent 5dfad33
commit 0e55837
Show file tree

Hide file tree

Showing 9 changed files with 501 additions and 74 deletions.
diff --git a/lookout/style/format/annotations/__init__.py b/lookout/style/format/annotations/__init__.py
@@ -0,0 +1,7 @@
+"""
+Annotation tool.
+
+Inspired by https://uima.apache.org/d/uimafit-current/api/
+"""
+
+# TODO(zurk) move annotation module and tests to lookout-sdk-ml
diff --git a/lookout/style/format/annotations/annotated_data.py b/lookout/style/format/annotations/annotated_data.py
@@ -0,0 +1,201 @@
+from typing import Dict, Iterable, Iterator, Optional, Sequence, Tuple, Union, Any, Type  # noqa F401
+
+from lookout.sdk.service_data_pb2 import File
+from sortedcontainers import SortedDict
+
+from lookout.style.format.annotations.annotations import Annotation, LanguageAnnotation, \
+    PathAnnotation, UASTAnnotation
+
+
+class NoIntersection(Exception):
+    """Raised by AnnotatedData.find_intersect() when no intersecting annotation is found."""
+
+
+class AnnotationsSlice(dict):
+    """
+    Annotations collection for a specific range.
+
+    Behaves like a regular dict and additionally remembers the range it was created for.
+    """
+
+    def __init__(self, start, stop, *args, **kwargs):
+        """
+        Init.
+
+        :param start: Start of the range.
+        :param stop: Stop of the range.
+        :param args: Positional arguments forwarded to dict().
+        :param kwargs: Keyword arguments forwarded to dict().
+        """
+        super().__init__(*args, **kwargs)
+        self._start, self._stop = start, stop
+        self._range = (start, stop)
+
+    @property
+    def start(self):
+        """Start of the range."""
+        return self._start
+
+    @property
+    def stop(self):
+        """Stop of the range."""
+        return self._stop
+
+    @property
+    def range(self):
+        """Range as a (start, stop) tuple."""
+        return self._range
+
+
+class AnnotatedData:
+    """
+    Class that couples annotations and data together.
+
+    Annotations are indexed both by their range and by their type.
+
+    All special utilities to work with annotations should be implemented in this class.
+    List of methods that should be implemented can be found here:
+    https://uima.apache.org/d/uimafit-current/api/org/apache/uima/fit/util/JCasUtil.html
+    """
+
+    def __init__(self, content: str):
+        """
+        Init.
+
+        :param content: Data to annotate. It is expected to be string but actually can be any \
+                        type with __getitem__() defined for int and slice input arguments.
+        """
+        self._content = content
+
+        # Annotations grouped by range: (start, stop) -> {annotation type -> annotation}.
+        self._range_to_annotations = SortedDict()  # type: SortedDict[(int, int), Dict[Type[Annotation], Annotation]]  # noqa E501
+        # Annotations grouped by type: annotation type -> {(start, stop) -> annotation}.
+        self._type_to_annotations = {}  # type: Dict[Type[Annotation], SortedDict[(int, int), Annotation]]  # noqa E501
+
+    @property
+    def content(self):
+        """Annotated data as it was passed to the constructor."""
+        return self._content
+
+    def __getitem__(self, item: Union[int, slice, Tuple[int, int]]) -> Any:
+        """
+        Get a part of the annotated content.
+
+        :param item: Index, slice or a tuple. A tuple is unpacked into slice() arguments.
+        :return: Corresponding item or slice of the content.
+        :raises KeyError: if the slice has a step.
+        """
+        if isinstance(item, tuple):
+            item = slice(*item)
+        if isinstance(item, slice) and item.step is not None:
+            raise KeyError("slice.step is not supported.")
+        return self._content[item]
+
+    def add(self, annotation: Annotation) -> None:
+        """
+        Add annotation.
+
+        An already stored annotation with the same type and range is replaced.
+
+        :param annotation: Annotation to add.
+        """
+        annotation_type = type(annotation)
+        annotation_range = annotation.range
+        # TODO(zurk): Add a check that there is no overlapping annotations of one type.
+        if annotation_range not in self._range_to_annotations:
+            self._range_to_annotations[annotation_range] = {}
+        if annotation_type not in self._type_to_annotations:
+            self._type_to_annotations[annotation_type] = SortedDict()
+        # Keep both indexes in sync.
+        self._range_to_annotations[annotation_range][annotation_type] = annotation
+        self._type_to_annotations[annotation_type][annotation_range] = annotation
+
+    def update(self, annotations: Iterable[Annotation]) -> None:
+        """
+        Update with annotations.
+
+        :param annotations: Annotations to add one by one with add().
+        """
+        for annotation in annotations:
+            self.add(annotation)
+
+    def iget(self, annotation_type: Type[Annotation], index: int) -> Annotation:
+        """
+        Return the annotation of the given type at the given position.
+
+        :param annotation_type: Type of the annotation.
+        :param index: Position of the annotation among the annotations of this type, which are \
+                      sorted by range.
+        :return: Requested annotation.
+        """
+        return self._type_to_annotations[annotation_type].peekitem(index)[1]
+
+    def iter_annotation(self, name: Type[Annotation], start_offset: Optional[int] = None,
+                        stop_offset: Optional[int] = None) -> Iterator[Annotation]:
+        """
+        Iterate through the annotations of one type: atomic_tokens, ys, files, etc.
+
+        :param name: Annotation type to iterate through.
+        :param start_offset: Not supported yet, must be None.
+        :param stop_offset: Not supported yet, must be None.
+        :return: Iterator over the annotations of the requested type sorted by range.
+        :raises NotImplementedError: if start_offset or stop_offset is specified.
+        :raises KeyError: if no annotation of the requested type was added.
+        """
+        if start_offset is not None or stop_offset is not None:
+            raise NotImplementedError()
+
+        yield from self._type_to_annotations[name].values()
+
+    def iter_annotations(self, types: Sequence[Type[Annotation]],
+                         start_offset: Optional[int] = None, stop_offset: Optional[int] = None,
+                         ) -> Iterator[AnnotationsSlice]:
+        """
+        Iterate through annotations with specified type.
+
+        The iteration goes over the annotations of the first type in `types`. For each of them \
+        the annotations of the other requested types are taken from the same range if they \
+        exist, otherwise the annotation that intersects the range is used. Types without an \
+        intersecting annotation are absent in the resulting slice.
+
+        :param types: Annotation types to collect. The first one defines the iteration ranges.
+        :param start_offset: Not supported yet, must be None.
+        :param stop_offset: Not supported yet, must be None.
+        :return: Requested annotations slice.
+        :raises NotImplementedError: if start_offset or stop_offset is specified.
+        """
+        if start_offset is not None or stop_offset is not None:
+            raise NotImplementedError()
+
+        types_set = frozenset(types)
+        for annotation0 in self.iter_annotation(types[0]):
+            # Annotations with the same range
+            same_range_annotations = self._range_to_annotations[annotation0.range]
+            same_range_types = set(same_range_annotations.keys())
+            common = types_set & same_range_types
+            missing = types_set - same_range_types
+            annotations = dict()
+            for annotation_type in missing:
+                try:
+                    annotations[annotation_type] = self.find_intersect(
+                        annotation_type, *annotation0.range)
+                except NoIntersection:
+                    # This type has nothing at this range, it is just not reported.
+                    pass
+            annotations.update({annotation_type: same_range_annotations[annotation_type]
+                                for annotation_type in common})
+            yield AnnotationsSlice(*annotation0.range, annotations)
+
+    def iter_items(self, types: Sequence[Type[Annotation]], start_offset: Optional[int] = None,
+                   stop_offset: Optional[int] = None,
+                   ) -> Iterator[Tuple[str, AnnotationsSlice]]:
+        """
+        Iterate through annotations with specified type together with the annotated data.
+
+        :param types: Annotation types to collect, see iter_annotations().
+        :param start_offset: Passed to iter_annotations().
+        :param stop_offset: Passed to iter_annotations().
+        :return: Annotated data slice with requested annotations.
+        """
+        for annotations in self.iter_annotations(types, start_offset, stop_offset):
+            yield self[annotations.range], annotations
+
+    def find_intersect(self, name: Type[Annotation], start: int, stop: int) -> Annotation:
+        """
+        Find an annotation of given type that intersects the interval [start, stop).
+
+        raises NoIntersection exception if there is no such annotation.
+
+        :param name: Annotation type.
+        :param start: start of interval.
+        :param stop: end of interval.
+        :return: requested Annotation.
+        """
+        try:
+            annotation_layer = self._type_to_annotations[name]
+        except KeyError:
+            # The KeyError is an implementation detail, do not expose it in the traceback.
+            raise NoIntersection("There is no annotation layer %s" % name) from None
+        # Step one position back: the previous annotation can start before `start` and still
+        # cover it.
+        search_start = max(0, annotation_layer.bisect_left((start, start)) - 1)
+        search_stop = annotation_layer.bisect_right((stop, stop))
+        for annotation_range in annotation_layer.islice(search_start, search_stop):
+            if self._check_interval_crossing(start, stop, *annotation_range):
+                # assuming that there is only one such annotation
+                return annotation_layer[annotation_range]
+        raise NoIntersection("There is no annotation %s from %d to %d" % (name, start, stop))
+
+    @classmethod
+    def _check_interval_crossing(cls, start1: int, stop1: int, start2: int, stop2: int) -> bool:
+        """
+        Check if the intervals [start1, stop1) and [start2, stop2) cross.
+
+        An empty interval [x, x) is treated as the position x:
+        * two empty intervals cross only if their positions are equal;
+        * an empty interval crosses a non-empty one only if its position is strictly inside \
+          the non-empty interval, both boundaries excluded.
+
+        :param start1: Start of the first interval.
+        :param stop1: Stop of the first interval.
+        :param start2: Start of the second interval.
+        :param stop2: Stop of the second interval.
+        :return: True if the intervals cross, False otherwise.
+        """
+        if start1 == stop1:
+            if start2 == stop2:
+                return start1 == start2
+            else:
+                return start2 < start1 < stop2
+        else:
+            if start2 == stop2:
+                return start1 < start2 < stop1
+            else:
+                return (start1 <= start2 < stop1 or
+                        start1 < stop2 < stop1 or
+                        start2 <= start1 < stop2)
+
+    def subiter_annotation(self, name: Type[Annotation], covering_annotation: Annotation):
+        """TODO."""
+        raise NotImplementedError()
+
+    def sub_iter_annotations(self, names: Sequence[Type[Annotation]],
+                             covering_annotation: Annotation):
+        """TODO."""
+        raise NotImplementedError()
+
+    @classmethod
+    def from_file(cls, file: File) -> "AnnotatedData":
+        """
+        Create AnnotatedData instance from File.
+
+        The content is decoded as UTF-8 with undecodable bytes replaced. Path, UAST and \
+        language annotations that cover the whole content are added.
+
+        :param file: file.content will be used as data to be annotated with \
+                     file.path, file.language and file.uast.
+        :return: new AnnotatedData instance.
+        """
+        raw_data = file.content.decode("utf-8", "replace")
+        size = len(raw_data)
+        # Use cls instead of the hard-coded class name so that subclasses get their own type.
+        annotated_data = cls(raw_data)
+        annotated_data.add(PathAnnotation(0, size, file.path))
+        annotated_data.add(UASTAnnotation(0, size, file.uast))
+        annotated_data.add(LanguageAnnotation(0, size, file.language))
+        return annotated_data
diff --git a/lookout/style/format/annotations/annotations.py b/lookout/style/format/annotations/annotations.py
@@ -0,0 +1,138 @@
+"""Annotations for style-analyzer."""
+from typing import Optional, Tuple
+
+
+class Annotation:
+    """Base class for annotation."""
+
+    def __init__(self, start: int, stop: int):
+        """
+        Initialization.
+
+        :param start: Annotation start.
+        :param stop: Annotation end.
+        """
+        self._start, self._stop = start, stop
+        self._range = (start, stop)
+
+    @property
+    def start(self) -> int:
+        """Annotation start."""
+        return self._start
+
+    @property
+    def stop(self) -> int:
+        """Annotation end."""
+        return self._stop
+
+    @property
+    def range(self) -> Tuple[int, int]:
+        """Annotation range as a (start, stop) tuple."""
+        return self._range
+
+    @property
+    def name(self) -> str:
+        """Annotation name, which is the name of the annotation class."""
+        return type(self).__name__
+
+    def __str__(self):
+        return "%s[%d, %d)" % (self.name, self.start, self.stop)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class AtomicTokenAnnotation(Annotation):
+    """Infrangible code token annotation."""
+
+
+class LineAnnotation(Annotation):
+    """Line number annotation."""
+
+    def __init__(self, start: int, stop: int, number: int):
+        """
+        Init.
+
+        :param start: Annotation start.
+        :param stop: Annotation end.
+        :param number: Line number.
+        """
+        super().__init__(start, stop)
+        self._number = number
+
+    @property
+    def number(self) -> int:
+        """Line number."""
+        return self._number
+
+
+class UASTNodeAnnotation(Annotation):
+    """UAST Node annotation."""
+
+    def __init__(self, start: int, stop: int, node: "bblfsh.Node"):
+        """
+        Init.
+
+        :param start: Annotation start.
+        :param stop: Annotation end.
+        :param node: UAST node to attach to the range.
+        """
+        super().__init__(start, stop)
+        self._node = node
+
+    @property
+    def node(self) -> "bblfsh.Node":
+        """Attached UAST node."""
+        return self._node
+
+    @staticmethod
+    def from_node(node: "bblfsh.Node") -> "UASTNodeAnnotation":
+        """
+        Create the annotation from bblfsh node.
+
+        The range is taken from the offsets of the node start and end positions.
+        """
+        start = node.start_position.offset
+        stop = node.end_position.offset
+        return UASTNodeAnnotation(start, stop, node)
+
+
+# Should be removed when overlapping annotations of one type are allowed.
+class UASTAnnotation(UASTNodeAnnotation):
+    """Full UAST of the file annotation."""
+
+    @property
+    def uast(self):
+        """UAST stored in this annotation, the same object as the `node` attribute."""
+        return self._node
+
+
+class TokenAnnotation(Annotation):
+    """Virtual code token annotation."""
+
+    def __init__(self, start: int, stop: int,
+                 uast_annotation: Optional[UASTNodeAnnotation] = None):
+        """
+        Initialization.
+
+        :param start: Annotation start.
+        :param stop: Annotation end.
+        :param uast_annotation: Related UASTNodeAnnotation Annotation if applicable.
+        """
+        super().__init__(start, stop)
+        self._uast_annotation = uast_annotation
+
+    @property
+    def uast_annotation(self) -> Optional[UASTNodeAnnotation]:
+        """Related UASTNodeAnnotation. None if there is no related annotation."""
+        return self._uast_annotation
+
+    @property
+    def node(self) -> Optional["bblfsh.Node"]:
+        """
+        Get UAST Node from related UASTNodeAnnotation.
+
+        :return: related bblfsh UAST Node. None if there is no related annotation.
+        """
+        if self._uast_annotation is None:
+            return None
+        return self._uast_annotation.node
+
+    @property
+    def has_node(self) -> bool:
+        """
+        Check if token annotation has related UAST node annotation.
+
+        :return: True if a UASTNodeAnnotation is attached, False otherwise.
+        """
+        return self._uast_annotation is not None
+
+
+class LanguageAnnotation(Annotation):
+    """Language of the file annotation."""
+
+    def __init__(self, start: int, stop: int, language: str):
+        """
+        Init.
+
+        :param start: Annotation start.
+        :param stop: Annotation end.
+        :param language: Language to attach to the range.
+        """
+        super().__init__(start, stop)
+        self._language = language
+
+    @property
+    def language(self) -> str:
+        """Attached language."""
+        return self._language
+
+
+class PathAnnotation(Annotation):
+    """File path annotation."""
+
+    def __init__(self, start: int, stop: int, path: str):
+        """
+        Init.
+
+        :param start: Annotation start.
+        :param stop: Annotation end.
+        :param path: Path to attach to the range.
+        """
+        super().__init__(start, stop)
+        self._path = path
+
+    @property
+    def path(self) -> str:
+        """Attached path."""
+        return self._path
+
+
+# NOTE: the class name is misspelled ("Intentation"); it is kept as is for compatibility.
+class AccumulatedIntentationAnnotation(Annotation):
+    """Accumulated indentation annotation for the spaces in the beginning of the line."""
+
+
+class TargetAnnotation(Annotation):
+    """Target for model prediction annotation."""
+
+    def __init__(self, start: int, stop: int, target: Tuple[int, ...]):
+        """
+        Init.
+
+        :param start: Annotation start.
+        :param stop: Annotation end.
+        :param target: Target to attach to the range.
+        """
+        super().__init__(start, stop)
+        self._target = target
+
+    @property
+    def target(self) -> Tuple[int, ...]:
+        """Attached target."""
+        return self._target