Skip to content

Commit

Permalink
Add annotations
Browse files Browse the repository at this point in the history
  • Loading branch information
zurk committed Mar 7, 2019
1 parent 5dfad33 commit 0e55837
Show file tree
Hide file tree
Showing 9 changed files with 501 additions and 74 deletions.
7 changes: 7 additions & 0 deletions lookout/style/format/annotations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""
Annotation tool.
Inspired by https://uima.apache.org/d/uimafit-current/api/
"""

# TODO(zurk) move annotation module and tests to lookout-sdk-ml
201 changes: 201 additions & 0 deletions lookout/style/format/annotations/annotated_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
from typing import Dict, Iterable, Iterator, Optional, Sequence, Tuple, Union, Any, Type # noqa F401

from lookout.sdk.service_data_pb2 import File
from sortedcontainers import SortedDict

from lookout.style.format.annotations.annotations import Annotation, LanguageAnnotation, \
PathAnnotation, UASTAnnotation


class NoIntersection(Exception):
    """Raised by AnnotatedData.find_intersect() if there is no intersection."""


class AnnotationsSlice(dict):
    """
    Dictionary of annotations that also remembers the range it was built for.

    Everything after `start` and `stop` is forwarded to the `dict` constructor unchanged.
    """

    def __init__(self, start, stop, *args, **kwargs):
        """
        Init.

        :param start: Beginning of the range this slice describes.
        :param stop: End of the range this slice describes.
        :param args: Positional arguments passed to `dict`.
        :param kwargs: Keyword arguments passed to `dict`.
        """
        super().__init__(*args, **kwargs)
        self._start, self._stop = start, stop
        self._range = (start, stop)

    @property
    def start(self):
        """Beginning of the range."""
        return self._start

    @property
    def stop(self):
        """End of the range."""
        return self._stop

    @property
    def range(self):
        """The `(start, stop)` tuple."""
        return self._range


class AnnotatedData:
    """
    Class that couples annotations and data together.

    All special utilities to work with annotations should be implemented in this class.
    List of methods that should be implemented can be found here:
    https://uima.apache.org/d/uimafit-current/api/org/apache/uima/fit/util/JCasUtil.html

    Annotations are indexed twice: by range (`_range_to_annotations`) and by annotation
    class (`_type_to_annotations`). The class of an annotation is used as its layer key.
    """

    def __init__(self, content: str):
        """
        Init.

        :param content: Data to annotate. It is expected to be string but actually can be any \
                        type with __getitem__() defined for int and slice input arguments.
        """
        self._content = content

        self._range_to_annotations = SortedDict()  # type: SortedDict[(int, int), Dict[Type[Annotation], Annotation]]  # noqa E501
        self._type_to_annotations = {}  # type: Dict[Type[Annotation], SortedDict[(int, int), Annotation]]  # noqa E501

    content = property(lambda self: self._content)

    def __getitem__(self, item: Union[int, slice, Tuple[int, int]]) -> Any:
        """
        Get a part of the annotated content.

        :param item: Index, slice or tuple. A tuple is unpacked into a slice, so \
                     `data[(start, stop)]` is the same as `data[start:stop]`.
        :return: Whatever the underlying content returns for the index or slice.
        :raises KeyError: if the requested slice has a step.
        """
        if isinstance(item, tuple):
            item = slice(*item)
        if isinstance(item, slice) and item.step is not None:
            raise KeyError("slice.step is not supported.")
        return self._content[item]

    def add(self, annotation: Annotation) -> None:
        """
        Add annotation.

        An annotation with the same class and the same range replaces the previous one.

        :param annotation: Annotation to register in both indexes.
        """
        annotation_type = type(annotation)
        # TODO(zurk): Add a check that there is no overlapping annotations of one type.
        if annotation.range not in self._range_to_annotations:
            self._range_to_annotations[annotation.range] = {}
        if annotation_type not in self._type_to_annotations:
            self._type_to_annotations[annotation_type] = SortedDict()
        self._range_to_annotations[annotation.range][annotation_type] = annotation
        self._type_to_annotations[annotation_type][annotation.range] = annotation

    def update(self, annotations: Iterable[Annotation]) -> None:
        """
        Update with annotations.

        :param annotations: Annotations to add one by one with `add()`.
        """
        for annotation in annotations:
            self.add(annotation)

    def iget(self, annotation_type: Type[Annotation], index: int) -> Annotation:
        """
        Return the annotation of the given type at the given position.

        :param annotation_type: Annotation class that selects the layer.
        :param index: Position inside the layer, which is sorted by annotation range.
        :return: Annotation stored at that position.
        """
        return self._type_to_annotations[annotation_type].peekitem(index)[1]

    def iter_annotation(self, name: Type[Annotation], start_offset: Optional[int] = None,
                        stop_offset: Optional[int] = None) -> Iterator[Annotation]:
        """
        Iterate through the annotations of one type in the sorted order of their ranges.

        :param name: Annotation class that selects the layer to iterate.
        :param start_offset: Not supported yet, must be None.
        :param stop_offset: Not supported yet, must be None.
        :return: Iterator over the annotations of the requested type.
        :raises NotImplementedError: if `start_offset` or `stop_offset` is specified.
        :raises KeyError: if there are no annotations of the requested type.
        """
        if start_offset is not None or stop_offset is not None:
            raise NotImplementedError()

        for value in self._type_to_annotations[name].values():
            yield value

    def iter_annotations(self, types: Sequence[Type[Annotation]],
                         start_offset: Optional[int] = None, stop_offset: Optional[int] = None,
                         ) -> Iterator[AnnotationsSlice]:
        """
        Iterate through annotations with specified type.

        The first type in `types` drives the iteration: one slice is yielded per annotation of
        that type. The other requested types are taken from the annotations with exactly the
        same range if they exist, otherwise from an annotation that intersects the range.
        Types that have no such annotation are left out of the slice.

        :param types: Annotation classes to collect. Must contain at least one class.
        :param start_offset: Not supported yet, must be None.
        :param stop_offset: Not supported yet, must be None.
        :return: Requested annotations slice.
        :raises NotImplementedError: if `start_offset` or `stop_offset` is specified.
        """
        if start_offset is not None or stop_offset is not None:
            raise NotImplementedError()

        types_set = frozenset(types)
        for annotation0 in self.iter_annotation(types[0]):
            # Annotations with the same range
            same_range_annotations = self._range_to_annotations[annotation0.range]
            same_range_types = set(same_range_annotations.keys())
            common = types_set & same_range_types
            missing = types_set - same_range_types
            annotations = dict()
            for annotation_type in missing:
                try:
                    annotations[annotation_type] = self.find_intersect(
                        annotation_type, *annotation0.range)
                except NoIntersection:
                    pass
            annotations.update({annotation_type: same_range_annotations[annotation_type]
                                for annotation_type in common})
            yield AnnotationsSlice(*annotation0.range, annotations)

    def iter_items(self, types: Sequence[Type[Annotation]], start_offset: Optional[int] = None,
                   stop_offset: Optional[int] = None,
                   ) -> Iterator[Tuple[str, AnnotationsSlice]]:
        """
        Iterate through annotations with specified type together with the annotated data.

        :param types: Annotation classes to collect, see `iter_annotations()`.
        :param start_offset: Passed to `iter_annotations()`.
        :param stop_offset: Passed to `iter_annotations()`.
        :return: Annotated data slice with requested annotations.
        """
        for annotations in self.iter_annotations(types, start_offset, stop_offset):
            yield self[annotations.range], annotations

    def find_intersect(self, name: Type[Annotation], start: int, stop: int) -> Annotation:
        """
        Find an annotation of given type that intersects the interval [start, stop).

        :param name: Annotation type.
        :param start: start of interval.
        :param stop: end of interval.
        :return: requested Annotation.
        :raises NoIntersection: if there is no annotation layer of this type or no annotation \
                                in the layer intersects the interval.
        """
        try:
            annotation_layer = self._type_to_annotations[name]
        except KeyError:
            # The missing key carries no extra information, so it is dropped from the traceback.
            raise NoIntersection("There is no annotation layer %s" % name) from None
        # Step one position back: the annotation right before the insertion point of
        # (start, start) may begin earlier and still cover `start`.
        search_start = max(0, annotation_layer.bisect_left((start, start)) - 1)
        search_stop = annotation_layer.bisect_right((stop, stop))
        for annotation_range in annotation_layer.islice(search_start, search_stop):
            if self._check_interval_crossing(start, stop, *annotation_range):
                # assuming that there is only one such annotation
                return annotation_layer[annotation_range]
        raise NoIntersection("There is no annotation %s from %d to %d" % (name, start, stop))

    @classmethod
    def _check_interval_crossing(cls, start1: int, stop1: int, start2: int, stop2: int) -> bool:
        """
        Check if the half-open intervals [start1, stop1) and [start2, stop2) cross.

        An empty interval [x, x) is handled as the single position x:

        * two empty intervals cross only if they are at the same position;
        * an empty interval crosses a non-empty one only if it lies strictly inside it, so \
          a position on either border of the non-empty interval does not count.

        :return: True if the intervals cross, False otherwise.
        """
        if start1 == stop1:
            if start2 == stop2:
                return start1 == start2
            else:
                return start2 < start1 < stop2
        else:
            if start2 == stop2:
                return start1 < start2 < stop1
            else:
                return (start1 <= start2 < stop1 or
                        start1 < stop2 < stop1 or
                        start2 <= start1 < stop2)

    def subiter_annotation(self, name: str, covering_annotation: Annotation):
        """TODO."""
        raise NotImplementedError()

    def sub_iter_annotations(self, names: Sequence[str], covering_annotation: Annotation):
        """TODO."""
        raise NotImplementedError()

    @classmethod
    def from_file(cls, file: File) -> "AnnotatedData":
        """
        Create AnnotatedData instance from File.

        The whole content is annotated with PathAnnotation, UASTAnnotation and
        LanguageAnnotation.

        :param file: file.content is decoded as UTF-8 (undecodable bytes are replaced) and \
                     used as data to be annotated with file.path, file.language and file.uast.
        :return: new instance of the class this method is called on.
        """
        raw_data = file.content.decode("utf-8", "replace")
        # `cls` instead of a hard-coded class name lets subclasses reuse this constructor.
        annotated_data = cls(raw_data)
        annotated_data.add(PathAnnotation(0, len(raw_data), file.path))
        annotated_data.add(UASTAnnotation(0, len(raw_data), file.uast))
        annotated_data.add(LanguageAnnotation(0, len(raw_data), file.language))
        return annotated_data
138 changes: 138 additions & 0 deletions lookout/style/format/annotations/annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""Annotations for style-analyzer."""
from typing import Optional, Tuple


class Annotation:
    """Base class for annotation."""

    def __init__(self, start: int, stop: int):
        """
        Initialization.

        :param start: Annotation start.
        :param stop: Annotation end.
        """
        self._start, self._stop = start, stop
        self._range = (start, stop)

    @property
    def start(self):
        """Annotation start."""
        return self._start

    @property
    def stop(self):
        """Annotation end."""
        return self._stop

    @property
    def range(self):
        """The `(start, stop)` tuple."""
        return self._range

    @property
    def name(self):
        """Name of the concrete annotation class."""
        return type(self).__name__

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        description = (self.name, self.start, self.stop)
        return "%s[%d, %d)" % description


class AtomicTokenAnnotation(Annotation):
    """Infrangible code token annotation."""


class LineAnnotation(Annotation):
    """Annotation that stores a line number for a range."""

    def __init__(self, start: int, stop: int, number: int):
        """
        Init.

        :param start: Annotation start.
        :param stop: Annotation end.
        :param number: Line number to store.
        """
        super().__init__(start, stop)
        self._number = number

    @property
    def number(self):
        """Stored line number."""
        return self._number


class UASTNodeAnnotation(Annotation):
    """Annotation that attaches a UAST node to a range."""

    def __init__(self, start: int, stop: int, node: "bblfsh.Node"):
        """
        Init.

        :param start: Annotation start.
        :param stop: Annotation end.
        :param node: UAST node to store.
        """
        super().__init__(start, stop)
        self._node = node

    @property
    def node(self):
        """Stored UAST node."""
        return self._node

    @staticmethod
    def from_node(node: "bblfsh.Node") -> "UASTNodeAnnotation":
        """
        Create the annotation from bblfsh node.

        :param node: Node whose start and end position offsets define the annotation range.
        :return: New UASTNodeAnnotation that stores the node.
        """
        begin = node.start_position.offset
        end = node.end_position.offset
        return UASTNodeAnnotation(begin, end, node)


# Should be removed when overlapping annotations of one type are allowed.
class UASTAnnotation(UASTNodeAnnotation):
    """Full UAST of the file annotation."""

    @property
    def uast(self):
        """Stored UAST; the same object that `node` returns."""
        return self._node


class TokenAnnotation(Annotation):
    """Virtual code token annotation."""

    def __init__(self, start: int, stop: int,
                 uast_annotation: Optional[UASTNodeAnnotation] = None):
        """
        Initialization.

        :param start: Annotation start.
        :param stop: Annotation end.
        :param uast_annotation: Related UASTNodeAnnotation Annotation if applicable.
        """
        super().__init__(start, stop)
        self._uast_annotation = uast_annotation

    uast_annotation = property(lambda self: self._uast_annotation)

    @property
    def node(self) -> Optional["bblfsh.Node"]:
        """
        Get UAST Node from related UASTNodeAnnotation.

        :return: related bblfsh UAST Node. None if there is no related annotation.
        """
        return self._uast_annotation.node if self._uast_annotation is not None else None

    @property
    def has_node(self) -> bool:
        """
        Check if token annotation has related UAST node annotation.

        :return: True if a related UASTNodeAnnotation was given, False otherwise.
        """
        # The check used to be inverted (`is None`) and reported True for tokens without a node.
        return self._uast_annotation is not None


class LanguageAnnotation(Annotation):
    """Annotation that stores the language of the file."""

    def __init__(self, start: int, stop: int, language: str):
        """
        Init.

        :param start: Annotation start.
        :param stop: Annotation end.
        :param language: Language to store.
        """
        super().__init__(start, stop)
        self._language = language

    @property
    def language(self):
        """Stored language."""
        return self._language


class PathAnnotation(Annotation):
    """Annotation that stores the path of the file."""

    def __init__(self, start: int, stop: int, path: str):
        """
        Init.

        :param start: Annotation start.
        :param stop: Annotation end.
        :param path: File path to store.
        """
        super().__init__(start, stop)
        self._path = path

    @property
    def path(self):
        """Stored file path."""
        return self._path


# NOTE(review): "Intentation" in the class name looks like a typo of "Indentation";
# renaming it would break importers, so the name is left as is.
class AccumulatedIntentationAnnotation(Annotation):
    """Accumulated indentation annotation for the spaces in the beginning of the line."""


class TargetAnnotation(Annotation):
    """Annotation that stores the target for model prediction."""

    def __init__(self, start: int, stop: int, target: Tuple[int, ...]):
        """
        Init.

        :param start: Annotation start.
        :param stop: Annotation end.
        :param target: Target to store.
        """
        super().__init__(start, stop)
        self._target = target

    @property
    def target(self):
        """Stored target."""
        return self._target
Loading

0 comments on commit 0e55837

Please sign in to comment.