ArneBinder committed · verified
Commit e7eaeed · 1 Parent(s): 6a6bb2a

upload https://github.com/ArneBinder/pie-document-level/pull/452

src/analysis/combine_job_returns.py CHANGED
@@ -47,6 +47,7 @@ def main(
     transpose: bool = False,
     unpack_multirun_results: bool = False,
     in_percent: bool = False,
+    reset_index: bool = False,
 ):
     file_paths = get_file_paths(
         paths_file=paths_file, file_name=file_name, use_aggregated=use_aggregated
@@ -97,9 +98,6 @@ def main(
     data = data.unstack(index_name)
     data = data.T
 
-    if transpose:
-        data = data.T
-
     # needs to happen before rounding, otherwise the rounding will be off
     if in_percent:
         data = data * 100
@@ -107,20 +105,23 @@ def main(
     if round_precision is not None:
         data = data.round(round_precision)
 
-    if format == "markdown":
-        print(data.to_markdown())
-    elif format == "markdown_mean_and_std":
-        if transpose:
-            data = data.T
+    # needs to happen before transposing
+    if format == "markdown_mean_and_std":
         if "mean" not in data.columns or "std" not in data.columns:
             raise ValueError("Columns 'mean' and 'std' are required for this format.")
         # create a single column with mean and std in the format: mean ± std
         data = pd.DataFrame(
            data["mean"].astype(str) + " ± " + data["std"].astype(str), columns=["mean ± std"]
         )
-        if transpose:
-            data = data.T
-        print(data.to_markdown())
+
+    if transpose:
+        data = data.T
+
+    if reset_index:
+        data = data.reset_index()
+
+    if format in ["markdown", "markdown_mean_and_std"]:
+        print(data.to_markdown(index=not reset_index))
     elif format == "json":
         print(data.to_json())
     else:
@@ -156,6 +157,9 @@ if __name__ == "__main__":
     parser.add_argument(
         "--in-percent", action="store_true", help="Show the values in percent (multiply by 100)"
     )
+    parser.add_argument(
+        "--reset-index", action="store_true", help="Reset the index of the combined job returns"
+    )
     parser.add_argument(
         "--format",
         type=str,
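Note on the reordering above: the mean ± std column is built first, and only afterwards the table is transposed, the index reset, and the markdown printed. A minimal standalone pandas sketch of that output pipeline (toy numbers, not part of the commit):

import pandas as pd

# toy aggregated multirun results: one row per metric
data = pd.DataFrame(
    {"mean": [0.8123, 0.7456], "std": [0.0121, 0.0233]},
    index=["f1", "precision"],
)
in_percent, round_precision, transpose, reset_index = True, 2, True, True

if in_percent:
    data = data * 100  # before rounding, otherwise the rounding would be off
if round_precision is not None:
    data = data.round(round_precision)

# combine mean and std before transposing, mirroring the new code path
data = pd.DataFrame(
    data["mean"].astype(str) + " ± " + data["std"].astype(str), columns=["mean ± std"]
)
if transpose:
    data = data.T
if reset_index:
    data = data.reset_index()

# the index is suppressed when it was reset, as in to_markdown(index=not reset_index)
print(data.to_markdown(index=not reset_index))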
src/analysis/show_score_distribution.py ADDED
@@ -0,0 +1,99 @@
+import pyrootutils
+
+root = pyrootutils.setup_root(
+    search_from=__file__,
+    indicator=[".project-root"],
+    pythonpath=True,
+    dotenv=False,
+)
+
+import argparse
+from typing import List, Optional
+
+import pandas as pd
+import plotly.figure_factory as ff
+from pie_datasets import DatasetDict
+
+pd.options.plotting.backend = "plotly"
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(
+        description="Show score distribution of annotations per layer"
+    )
+    # --data-dir predictions/default/2025-02-26_14-28-17
+    parser.add_argument(
+        "--data-dir", type=str, required=True, help="Path to the dataset directory"
+    )
+    parser.add_argument("--split", type=str, default="test", help="Dataset split to use")
+    parser.add_argument(
+        "--layers",
+        nargs="+",
+        default=["labeled_spans", "binary_relations"],
+        help="Annotation layers to use",
+    )
+    # --layer-captions ADUs "Argumentative Relations"
+    parser.add_argument(
+        "--layer-captions", nargs="+", help="Captions for the figure traces per layer"
+    )
+    # --layer-colors "rgb(31,119,180)" "rgb(255,127,14)"
+    parser.add_argument("--layer-colors", nargs="+", help="Colors for the figure traces per layer")
+
+    args = parser.parse_args()
+
+    # Load the dataset
+    ds = DatasetDict.from_json(data_dir=args.data_dir)[args.split]
+
+    # get scores per annotation layer and label
+    layers = args.layers
+    all_scores = []
+    all_scores_idx = []
+    for doc in ds:
+        for layer in layers:
+            for ann in doc[layer].predictions:
+                all_scores.append(ann.score)
+                all_scores_idx.append((doc.id, layer, getattr(ann, "label", None)))
+    scores = pd.Series(
+        all_scores,
+        index=pd.MultiIndex.from_tuples(all_scores_idx, names=["doc_id", "layer", "label"]),
+        name="score",
+    )
+
+    if args.layer_captions is not None:
+        if len(args.layer_captions) < len(layers):
+            raise ValueError("Not enough captions provided for all layers")
+        name_mapping = dict(zip(layers, args.layer_captions))
+    else:
+        name_mapping = dict(zip(layers, layers))
+
+    colors: Optional[List[str]] = None
+    if args.layer_colors is not None:
+        if len(args.layer_colors) < len(layers):
+            raise ValueError("Not enough colors provided for all layers")
+        color_mapping = dict(zip(layers, args.layer_colors))
+        colors = [color_mapping[layer] for layer in layers]
+    else:
+        colors = None
+
+    score_groups = {layer: scores.xs(layer, level="layer").to_numpy() for layer in layers}
+    group_labels, hist_data = zip(*score_groups.items())
+    group_labels_renamed = [name_mapping[label] for label in group_labels]
+    fig = ff.create_distplot(
+        hist_data,
+        group_labels=group_labels_renamed,
+        show_hist=True,
+        colors=colors,
+        bin_size=0.025,
+    )
+
+    fig.update_layout(
+        height=600,
+        width=800,
+        title_text="Score Distribution per Annotation Layer",
+        title_x=0.5,
+        barmode="group",
+    )
+    fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))
+
+    fig.show()
+    print("done")
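A minimal sketch of what the script does once the scores are collected, using made-up scores instead of a PIE dataset (layer names and values are illustrative only):

import pandas as pd
import plotly.figure_factory as ff

# toy prediction scores, indexed the same way as in the script above
records = [
    ("doc1", "labeled_spans", "claim", 0.91),
    ("doc1", "labeled_spans", "data", 0.62),
    ("doc2", "labeled_spans", "claim", 0.77),
    ("doc1", "binary_relations", "supports", 0.48),
    ("doc2", "binary_relations", "supports", 0.85),
    ("doc2", "binary_relations", "attacks", 0.59),
]
index = pd.MultiIndex.from_tuples(
    [(d, layer, lab) for d, layer, lab, _ in records], names=["doc_id", "layer", "label"]
)
scores = pd.Series([s for *_, s in records], index=index, name="score")

# one histogram/KDE trace per annotation layer, as in the script
layers = ["labeled_spans", "binary_relations"]
hist_data = [scores.xs(layer, level="layer").to_numpy() for layer in layers]
fig = ff.create_distplot(hist_data, group_labels=layers, show_hist=True, bin_size=0.025)
fig.show()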
src/data/calc_iaa_for_brat.py CHANGED
@@ -92,6 +92,7 @@ def calc_brat_iaas(
             create_multi_spans=True,
             result_document_type=BratDocument,
             result_field_mapping={"spans": "spans", "relations": "relations"},
+            combine_scores_method="product",
         )
     else:
         merger = None
src/datamodules/datamodule.py CHANGED
@@ -1,3 +1,4 @@
+import logging
 from typing import Any, Dict, Generic, Optional, Sequence, TypeVar, Union
 
 from pytorch_ie.core import Document
@@ -21,6 +22,8 @@ DatasetType: TypeAlias = Union[
     IterableTaskEncodingDataset[TaskEncoding[DocumentType, InputEncoding, TargetEncoding]],
 ]
 
+logger = logging.getLogger(__name__)
+
 
 class PieDataModule(LightningDataModule, Generic[DocumentType, InputEncoding, TargetEncoding]):
     """A simple LightningDataModule for PIE document datasets.
@@ -49,6 +52,7 @@ class PieDataModule(LightningDataModule, Generic[DocumentType, InputEncoding, Ta
         test_split: Optional[str] = "test",
         show_progress_for_encode: bool = False,
         train_sampler: Optional[str] = None,
+        dont_shuffle_train: bool = False,
         **dataloader_kwargs,
     ):
         super().__init__()
@@ -62,6 +66,7 @@ class PieDataModule(LightningDataModule, Generic[DocumentType, InputEncoding, Ta
         self.show_progress_for_encode = show_progress_for_encode
         self.train_sampler_name = train_sampler
         self.dataloader_kwargs = dataloader_kwargs
+        self.dont_shuffle_train = dont_shuffle_train
 
         self._data: Dict[str, DatasetType] = {}
 
@@ -128,12 +133,17 @@ class PieDataModule(LightningDataModule, Generic[DocumentType, InputEncoding, Ta
             sampler = self.get_train_sampler(sampler_name=self.train_sampler_name, dataset=ds)
         else:
             sampler = None
+        # don't shuffle streamed datasets or if we use a sampler or if we explicitly set dont_shuffle_train
+        shuffle = not self.dont_shuffle_train and not (
+            isinstance(ds, IterableTaskEncodingDataset) or sampler is not None
+        )
+        if not shuffle:
+            logger.warning("not shuffling train dataloader")
         return DataLoader(
             dataset=ds,
             sampler=sampler,
             collate_fn=self.taskmodule.collate,
-            # don't shuffle streamed datasets or if we use a sampler
-            shuffle=not (isinstance(ds, IterableTaskEncodingDataset) or sampler is not None),
+            shuffle=shuffle,
             **self.dataloader_kwargs,
         )
 
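The new shuffle behaviour boils down to a single predicate; this is a standalone sketch of that decision logic (the function name is hypothetical, not part of the module):

def resolve_train_shuffle(
    dont_shuffle_train: bool, is_iterable_dataset: bool, has_sampler: bool
) -> bool:
    # shuffle only when nothing forbids it: not explicitly disabled,
    # not a streamed (iterable) dataset, and no custom sampler in use
    return not dont_shuffle_train and not (is_iterable_dataset or has_sampler)

assert resolve_train_shuffle(False, False, False) is True
assert resolve_train_shuffle(True, False, False) is False   # explicitly disabled
assert resolve_train_shuffle(False, True, False) is False   # streamed dataset
assert resolve_train_shuffle(False, False, True) is False   # custom sampler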
src/demo/annotation_utils.py CHANGED
@@ -37,6 +37,7 @@ def get_merger() -> SpansViaRelationMerger:
             "binary_relations": "binary_relations",
             "labeled_partitions": "labeled_partitions",
         },
+        combine_scores_method="product",
     )
 
 
src/demo/retrieve_and_dump_all_relevant.py CHANGED
@@ -10,9 +10,17 @@ root = pyrootutils.setup_root(
 import argparse
 import logging
 import os
+from typing import Dict, List, Optional, Tuple
 
 import pandas as pd
+from pie_datasets import Dataset, DatasetDict
+from pytorch_ie import Annotation
+from pytorch_ie.annotations import BinaryRelation, MultiSpan, Span
 
+from document.types import (
+    RelatedRelation,
+    TextDocumentWithLabeledMultiSpansBinaryRelationsLabeledPartitionsAndRelatedRelations,
+)
 from src.demo.retriever_utils import (
     retrieve_all_relevant_spans,
     retrieve_all_relevant_spans_for_all_documents,
@@ -23,6 +31,168 @@ from src.langchain_modules import DocumentAwareSpanRetrieverWithRelations
 logger = logging.getLogger(__name__)
 
 
+def get_original_doc_id_and_offsets(doc_id: str) -> Tuple[str, int, Optional[int]]:
+    original_doc_id, middle, start_end, ext = doc_id.split(".")
+    if middle == "remaining":
+        return original_doc_id, int(start_end), None
+    elif middle == "abstract":
+        start, end = start_end.split("_")
+        return original_doc_id, int(start), int(end)
+    else:
+        raise ValueError(f"unexpected doc_id format: {doc_id}")
+
+
+def add_base_annotations(
+    documents: Dict[
+        str, TextDocumentWithLabeledMultiSpansBinaryRelationsLabeledPartitionsAndRelatedRelations
+    ],
+    retrieved_doc_ids: List[str],
+    retriever: DocumentAwareSpanRetrieverWithRelations,
+) -> Dict[Tuple[str, Annotation], Tuple[str, Annotation]]:
+    # (retrieved_doc_id, retrieved_annotation) -> (original_doc_id, original_annotation)
+    annotation_mapping = {}
+    for retrieved_doc_id in retrieved_doc_ids:
+        pie_doc = retriever.get_document(retrieved_doc_id).metadata["pie_document"].copy()
+        original_doc_id, offset, _ = get_original_doc_id_and_offsets(retrieved_doc_id)
+        document = documents[original_doc_id]
+        span_mapping = {}
+        for span in pie_doc.labeled_multi_spans.predictions:
+            if isinstance(span, MultiSpan):
+                new_span = span.copy(
+                    slices=[(start + offset, end + offset) for start, end in span.slices]
+                )
+            elif isinstance(span, Span):
+                new_span = span.copy(start=span.start + offset, end=span.end + offset)
+            else:
+                raise ValueError(f"unexpected span type: {span}")
+            span_mapping[span] = new_span
+        document.labeled_multi_spans.predictions.extend(span_mapping.values())
+        for relation in pie_doc.binary_relations.predictions:
+            new_relation = relation.copy(
+                head=span_mapping[relation.head], tail=span_mapping[relation.tail]
+            )
+            document.binary_relations.predictions.append(new_relation)
+        for old_ann, new_ann in span_mapping.items():
+            annotation_mapping[(retrieved_doc_id, old_ann)] = (original_doc_id, new_ann)
+
+    return annotation_mapping
+
+
+def get_doc_and_span_id2annotation_mapping(
+    span_ids: pd.Series,
+    doc_ids: pd.Series,
+    retriever: DocumentAwareSpanRetrieverWithRelations,
+    base_annotation_mapping: Dict[Tuple[str, Annotation], Tuple[str, Annotation]],
+) -> Dict[Tuple[str, str], Tuple[str, Annotation]]:
+    if len(doc_ids) != len(span_ids):
+        raise ValueError("doc_ids and span_ids must have the same length")
+    doc_and_span_ids = zip(doc_ids.tolist(), span_ids.tolist())
+    return {
+        (doc_id, span_id): base_annotation_mapping[(doc_id, retriever.get_span_by_id(span_id))]
+        for doc_id, span_id in set(doc_and_span_ids)
+    }
+
+
+def add_result_to_gold_data(
+    result: pd.DataFrame,
+    gold_dataset_dir: str,
+    dataset_out_dir: str,
+    retriever: DocumentAwareSpanRetrieverWithRelations,
+    split: Optional[str] = None,
+    link_relation_label: str = "semantically_same",
+    reversed_relation_suffix: str = "_reversed",
+):
+
+    if not os.path.exists(gold_dataset_dir):
+        raise ValueError(f"gold dataset directory does not exist: {gold_dataset_dir}")
+
+    dataset_dict = DatasetDict.from_json(data_dir=gold_dataset_dir)
+    if split is None and len(dataset_dict) == 1:
+        split = list(dataset_dict.keys())[0]
+    if split is None:
+        raise ValueError("need to provide split name to add results to gold dataset")
+
+    dataset = dataset_dict[split]
+
+    doc_id2doc = {doc.id: doc for doc in dataset}
+    retriever_doc_ids = (
+        result["doc_id"].unique().tolist() + result["query_doc_id"].unique().tolist()
+    )
+    base_annotation_mapping = add_base_annotations(
+        documents=doc_id2doc, retrieved_doc_ids=retriever_doc_ids, retriever=retriever
+    )
+    # (retriever_doc_id, retriever_span_id) -> (original_doc_id, original_span)
+    doc_and_span_id2annotation = {}
+    doc_and_span_id2annotation.update(
+        get_doc_and_span_id2annotation_mapping(
+            span_ids=result["span_id"],
+            doc_ids=result["doc_id"],
+            retriever=retriever,
+            base_annotation_mapping=base_annotation_mapping,
+        )
+    )
+    doc_and_span_id2annotation.update(
+        get_doc_and_span_id2annotation_mapping(
+            span_ids=result["ref_span_id"],
+            doc_ids=result["doc_id"],
+            retriever=retriever,
+            base_annotation_mapping=base_annotation_mapping,
+        )
+    )
+    doc_and_span_id2annotation.update(
+        get_doc_and_span_id2annotation_mapping(
+            span_ids=result["query_span_id"],
+            doc_ids=result["query_doc_id"],
+            retriever=retriever,
+            base_annotation_mapping=base_annotation_mapping,
+        )
+    )
+    doc_id2head_tail2relation = {}
+    for doc_id, doc in doc_id2doc.items():
+        head_and_tail2relation = {}
+        for relation in doc.binary_relations.predictions:
+            head_and_tail2relation[(relation.head, relation.tail)] = relation
+        doc_id2head_tail2relation[doc_id] = head_and_tail2relation
+
+    for row in result.itertuples():
+        query_doc_id, query_span = doc_and_span_id2annotation[
+            (row.query_doc_id, row.query_span_id)
+        ]
+        doc_id, span = doc_and_span_id2annotation[(row.doc_id, row.span_id)]
+        doc_id2, ref_span = doc_and_span_id2annotation[(row.doc_id, row.ref_span_id)]
+        if doc_id != query_doc_id:
+            raise ValueError("doc_id and query_doc_id must be the same")
+        if doc_id != doc_id2:
+            raise ValueError("doc_id and ref_doc_id must be the same")
+        doc = doc_id2doc[doc_id]
+        link_rel = BinaryRelation(
+            head=query_span, tail=ref_span, label=link_relation_label, score=row.sim_score
+        )
+        doc.binary_relations.predictions.append(link_rel)
+        head_and_tail2relation = doc_id2head_tail2relation[doc_id]
+        related_rel_label = row.type
+        if related_rel_label.endswith(reversed_relation_suffix):
+            base_rel = head_and_tail2relation[(span, ref_span)]
+        else:
+            base_rel = head_and_tail2relation[(ref_span, span)]
+        related_rel = RelatedRelation(
+            head=query_span,
+            tail=span,
+            link_relation=link_rel,
+            relation=base_rel,
+            label=related_rel_label,
+            score=link_rel.score * base_rel.score,
+        )
+        doc.related_relations.predictions.append(related_rel)
+
+    dataset = Dataset.from_documents(list(doc_id2doc.values()))
+    dataset_dict = DatasetDict({split: dataset})
+    if not os.path.exists(dataset_out_dir):
+        os.makedirs(dataset_out_dir, exist_ok=True)
+
+    dataset_dict.to_json(dataset_out_dir)
+
+
 if __name__ == "__main__":
 
     parser = argparse.ArgumentParser()
@@ -81,6 +251,19 @@ if __name__ == "__main__":
        '(each separated by ":") to retrieve spans for. If provided, '
        "--query_doc_id and --query_span_id are ignored.",
     )
+    parser.add_argument(
+        "--gold_dataset_dir",
+        type=str,
+        default=None,
+        help="If provided, add the spans and base relations from the retriever data as well "
+        "as the related relations to the gold dataset.",
+    )
+    parser.add_argument(
+        "--dataset_out_dir",
+        type=str,
+        default=None,
+        help="If provided, save the enriched gold dataset to this directory.",
+    )
     args = parser.parse_args()
 
     logging.basicConfig(
@@ -157,4 +340,17 @@ if __name__ == "__main__":
        os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
        all_spans_for_all_documents.to_json(args.output_path)
 
+    if args.gold_dataset_dir is not None:
+        logger.info(
+            f"reading gold data from {args.gold_dataset_dir} and adding results as predictions ..."
+        )
+        if args.dataset_out_dir is None:
+            raise ValueError("need to provide --dataset_out_dir to save the enriched dataset")
+        add_result_to_gold_data(
+            all_spans_for_all_documents,
+            gold_dataset_dir=args.gold_dataset_dir,
+            dataset_out_dir=args.dataset_out_dir,
+            retriever=retriever,
+        )
+
     logger.info("done")
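The helpers above assume retriever document ids of the form <original_id>.<abstract|remaining>.<offsets>.<ext>. A standalone sketch of that parsing convention, with made-up ids (the convention itself is taken from get_original_doc_id_and_offsets in this commit):

from typing import Optional, Tuple

def get_original_doc_id_and_offsets(doc_id: str) -> Tuple[str, int, Optional[int]]:
    # mirrors the helper added above
    original_doc_id, middle, start_end, ext = doc_id.split(".")
    if middle == "remaining":
        return original_doc_id, int(start_end), None
    elif middle == "abstract":
        start, end = start_end.split("_")
        return original_doc_id, int(start), int(end)
    else:
        raise ValueError(f"unexpected doc_id format: {doc_id}")

# made-up ids, only to show the expected shape of the two variants
print(get_original_doc_id_and_offsets("A01.abstract.0_1234.txt"))   # ('A01', 0, 1234)
print(get_original_doc_id_and_offsets("A01.remaining.1234.txt"))    # ('A01', 1234, None)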
src/demo/retriever_utils.py CHANGED
@@ -51,6 +51,7 @@ def load_retriever(
 def retrieve_similar_spans(
     retriever: DocumentAwareSpanRetriever,
     query_span_id: str,
+    min_score: float = 0.0,
     **kwargs,
 ) -> pd.DataFrame:
     if not query_span_id.strip():
@@ -60,21 +61,42 @@ def retrieve_similar_spans(
         records = []
         for similar_span_doc in retrieval_result:
             pie_doc, metadata = retriever.docstore.unwrap_with_metadata(similar_span_doc)
+            query_span = retriever.get_span_by_id(span_id=query_span_id)
+            query_span_score = query_span.score
             span_ann = metadata["attached_span"]
+            sim_score = metadata["relevance_score"]
+            span_score = span_ann.score
+            score = query_span_score * sim_score * span_score
             records.append(
                 {
+                    "score": score,
                     "doc_id": pie_doc.id,
                     "span_id": similar_span_doc.id,
-                    "score": metadata["relevance_score"],
+                    "sim_score": sim_score,
+                    "query_span_score": query_span_score,
+                    "span_score": span_score,
                     "label": span_ann.label,
                     "text": str(span_ann),
                 }
             )
-        return (
-            pd.DataFrame(records, columns=["doc_id", "score", "label", "text", "span_id"])
+        result = (
+            pd.DataFrame(
+                records,
+                columns=[
+                    "score",
+                    "text",
+                    "label",
+                    "sim_score",
+                    "span_score",
+                    "query_span_score",
+                    "doc_id",
+                    "span_id",
+                ],
+            )
             .sort_values(by="score", ascending=False)
             .round(3)
         )
+        return result[result["score"] >= min_score]
     except Exception as e:
         raise gr.Error(f"Failed to retrieve similar ADUs: {e}")
 
@@ -83,6 +105,7 @@ def retrieve_relevant_spans(
     retriever: DocumentAwareSpanRetriever,
     query_span_id: str,
     relation_label_mapping: Optional[dict[str, str]] = None,
+    min_score: float = 0.0,
     **kwargs,
 ) -> pd.DataFrame:
     if not query_span_id.strip():
@@ -98,40 +121,57 @@ def retrieve_relevant_spans(
             mapped_relation_label = relation_label_mapping.get(
                 metadata["relation_label"], metadata["relation_label"]
             )
+
+            query_span = retriever.get_span_by_id(span_id=query_span_id)
+            query_span_score = query_span.score
+            sim_score = metadata["relevance_score"]
+            ref_span_score = span_ann.score
+            rel_score = metadata["relation_score"]
+            span_score = tail_span_ann.score
+            score = query_span_score * sim_score * ref_span_score * rel_score * span_score
             records.append(
                 {
                     "doc_id": pie_doc.id,
                     "type": mapped_relation_label,
-                    "rel_score": metadata["relation_score"],
+                    "score": score,
+                    "rel_score": rel_score,
                     "text": str(tail_span_ann),
                     "span_id": relevant_span_doc.id,
+                    "span_score": span_score,
                     "label": tail_span_ann.label,
-                    "ref_score": metadata["relevance_score"],
+                    "sim_score": sim_score,
                     "ref_label": span_ann.label,
                     "ref_text": str(span_ann),
                     "ref_span_id": metadata["head_id"],
+                    "ref_span_score": ref_span_score,
+                    "query_span_score": query_span_score,
                 }
             )
-        return (
+        result = (
             pd.DataFrame(
                 records,
                 columns=[
+                    "score",
                     "type",
-                    # omitted for now, we get no valid relation scores for the generative model
-                    # "rel_score",
-                    "ref_score",
-                    "label",
                     "text",
+                    "label",
+                    "rel_score",
+                    "sim_score",
                     "ref_label",
+                    "ref_span_score",
                     "ref_text",
                     "doc_id",
                     "span_id",
+                    "span_score",
                     "ref_span_id",
+                    "query_span_score",
                 ],
             )
-            .sort_values(by=["ref_score"], ascending=False)
+            .sort_values(by=["score"], ascending=False)
             .round(3)
         )
+        return result[result["score"] >= min_score]
+
     except Exception as e:
         raise gr.Error(f"Failed to retrieve relevant ADUs: {e}")
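Both retrieval helpers now report a combined "score" that is the product of the individual scores, and rows below min_score are dropped after sorting and rounding. A toy pandas sketch of that post-processing (column values are made up):

import pandas as pd

records = pd.DataFrame(
    {
        "query_span_score": [0.99, 0.95],
        "sim_score": [0.87, 0.42],
        "span_score": [0.93, 0.90],
    }
)
# combined score = product of the individual scores, as in the updated helpers
records["score"] = (
    records["query_span_score"] * records["sim_score"] * records["span_score"]
)

min_score = 0.5
result = records.sort_values(by="score", ascending=False).round(3)
print(result[result["score"] >= min_score])  # only the first row survives the threshold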
src/document/processing.py CHANGED
@@ -1,16 +1,20 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, TypeVar
+from collections import defaultdict
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, TypeVar, Union
 
-from pie_modules.document.processing.merge_spans_via_relation import _merge_spans_via_relation
-from pie_modules.documents import TextDocumentWithLabeledMultiSpansAndBinaryRelations
 from pie_modules.utils.span import have_overlap
 from pytorch_ie import AnnotationLayer
+from pytorch_ie.annotations import LabeledMultiSpan, LabeledSpan, MultiSpan, Span
 from pytorch_ie.core import Document
 from pytorch_ie.core.document import Annotation, _enumerate_dependencies
 
-from src.utils import distance
+from src.document.types import (
+    RelatedRelation,
+    TextDocumentWithLabeledMultiSpansBinaryRelationsLabeledPartitionsAndRelatedRelations,
+)
+from src.utils import distance, distance_slices
 from src.utils.span_utils import get_overlap_len
 
 logger = logging.getLogger(__name__)
@@ -68,58 +72,6 @@ def remove_overlapping_entities(
     return new_doc
 
 
-# TODO: remove and use pie_modules.document.processing.SpansViaRelationMerger instead
-def merge_spans_via_relation(
-    document: D,
-    relation_layer: str,
-    link_relation_label: str,
-    use_predicted_spans: bool = False,
-    process_predictions: bool = True,
-    create_multi_spans: bool = False,
-) -> D:
-
-    rel_layer = document[relation_layer]
-    span_layer = rel_layer.target_layer
-    new_gold_spans, new_gold_relations = _merge_spans_via_relation(
-        spans=span_layer,
-        relations=rel_layer,
-        link_relation_label=link_relation_label,
-        create_multi_spans=create_multi_spans,
-    )
-    if process_predictions:
-        new_pred_spans, new_pred_relations = _merge_spans_via_relation(
-            spans=span_layer.predictions if use_predicted_spans else span_layer,
-            relations=rel_layer.predictions,
-            link_relation_label=link_relation_label,
-            create_multi_spans=create_multi_spans,
-        )
-    else:
-        assert not use_predicted_spans
-        new_pred_spans = set(span_layer.predictions.clear())
-        new_pred_relations = set(rel_layer.predictions.clear())
-
-    relation_layer_name = relation_layer
-    span_layer_name = document[relation_layer].target_name
-    if create_multi_spans:
-        doc_dict = document.asdict()
-        for f in document.annotation_fields():
-            doc_dict.pop(f.name)
-
-        result = TextDocumentWithLabeledMultiSpansAndBinaryRelations.fromdict(doc_dict)
-        result.labeled_multi_spans.extend(new_gold_spans)
-        result.labeled_multi_spans.predictions.extend(new_pred_spans)
-        result.binary_relations.extend(new_gold_relations)
-        result.binary_relations.predictions.extend(new_pred_relations)
-    else:
-        result = document.copy(with_annotations=False)
-        result[span_layer_name].extend(new_gold_spans)
-        result[span_layer_name].predictions.extend(new_pred_spans)
-        result[relation_layer_name].extend(new_gold_relations)
-        result[relation_layer_name].predictions.extend(new_pred_relations)
-
-    return result
-
-
 def remove_partitions_by_labels(
     document: D, partition_layer: str, label_blacklist: List[str], span_layer: Optional[str] = None
 ) -> D:
@@ -249,31 +201,19 @@ def relabel_annotations(
 DWithSpans = TypeVar("DWithSpans", bound=Document)
 
 
-def align_predicted_span_annotations(
-    document: DWithSpans, span_layer: str, distance_type: str = "center", verbose: bool = False
-) -> DWithSpans:
-    """
-    Aligns predicted span annotations with the closest gold spans in a document.
-
-    First, calculates the distance between each predicted span and each gold span. Then,
-    for each predicted span, the gold span with the smallest distance is selected. If the
-    predicted span and the gold span have an overlap of at least half of the maximum length
-    of the two spans, the predicted span is aligned with the gold span.
-
-    Args:
-        document: The document to process.
-        span_layer: The name of the span layer.
-        distance_type: The type of distance to calculate. One of: center, inner, outer
-        verbose: Whether to print debug information.
-
-    Returns:
-        The processed document.
-    """
-    gold_spans = document[span_layer]
-    if len(gold_spans) == 0:
-        return document.copy()
-
-    pred_spans = document[span_layer].predictions
+def get_start_end(span: Union[Span, MultiSpan]) -> Tuple[int, int]:
+    if isinstance(span, Span):
+        return span.start, span.end
+    elif isinstance(span, MultiSpan):
+        starts, ends = zip(*span.slices)
+        return min(starts), max(ends)
+    else:
+        raise ValueError(f"Unsupported span type: {type(span)}")
+
+
+def _get_aligned_span_mappings(
+    gold_spans: Iterable[Span], pred_spans: Iterable[Span], distance_type: str
+) -> Tuple[Dict[int, Span], Dict[int, Span]]:
     old2new_pred_span = {}
     span_id2gold_span = {}
     for pred_span in pred_spans:
@@ -282,29 +222,32 @@ def align_predicted_span_annotations(
             (
                 gold_span,
                 distance(
-                    start_end=(pred_span.start, pred_span.end),
-                    other_start_end=(gold_span.start, gold_span.end),
+                    start_end=get_start_end(pred_span),
+                    other_start_end=get_start_end(gold_span),
                     distance_type=distance_type,
                 ),
            )
            for gold_span in gold_spans
        ]
+        if len(gold_spans_with_distance) == 0:
+            continue
 
         closest_gold_span, min_distance = min(gold_spans_with_distance, key=lambda x: x[1])
         # if the closest gold span is the same as the predicted span, we don't need to align
         if min_distance == 0.0:
             continue
 
+        pred_start_end = get_start_end(pred_span)
+        closest_gold_start_end = get_start_end(closest_gold_span)
+
         if have_overlap(
-            start_end=(pred_span.start, pred_span.end),
-            other_start_end=(closest_gold_span.start, closest_gold_span.end),
+            start_end=pred_start_end,
+            other_start_end=closest_gold_start_end,
         ):
-            overlap_len = get_overlap_len(
-                (pred_span.start, pred_span.end), (closest_gold_span.start, closest_gold_span.end)
-            )
-            # get the maximum length of the two spans
+            overlap_len = get_overlap_len(pred_start_end, closest_gold_start_end)
             l_max = max(
-                pred_span.end - pred_span.start, closest_gold_span.end - closest_gold_span.start
+                pred_start_end[1] - pred_start_end[0],
+                closest_gold_start_end[1] - closest_gold_start_end[0],
             )
             # if the overlap is at least half of the maximum length, we consider it a valid match for alignment
             valid_match = overlap_len >= (l_max / 2)
@@ -312,12 +255,140 @@
             valid_match = False
 
         if valid_match:
-            aligned_pred_span = pred_span.copy(
-                start=closest_gold_span.start, end=closest_gold_span.end
-            )
+            if isinstance(pred_span, Span):
+                aligned_pred_span = pred_span.copy(
+                    start=closest_gold_span.start, end=closest_gold_span.end
+                )
+            elif isinstance(pred_span, MultiSpan):
+                aligned_pred_span = pred_span.copy(slices=closest_gold_span.slices)
+            else:
+                raise ValueError(f"Unsupported span type: {type(pred_span)}")
             old2new_pred_span[pred_span._id] = aligned_pred_span
             span_id2gold_span[pred_span._id] = closest_gold_span
 
+    return old2new_pred_span, span_id2gold_span
+
+
+def get_spans2multi_spans_mapping(multi_spans: Iterable[MultiSpan]) -> Dict[Span, MultiSpan]:
+    result = {}
+    for multi_span in multi_spans:
+        for start, end in multi_span.slices:
+            span_kwargs = dict(start=start, end=end, score=multi_span.score)
+            if isinstance(multi_span, LabeledMultiSpan):
+                result[LabeledSpan(label=multi_span.label, **span_kwargs)] = multi_span
+            else:
+                result[Span(**span_kwargs)] = multi_span
+
+    return result
+
+
+def align_predicted_span_annotations(
+    document: DWithSpans,
+    span_layer: str,
+    distance_type: str = "center",
+    simple_multi_span: bool = False,
+    verbose: bool = False,
+) -> DWithSpans:
+    """
+    Aligns predicted span annotations with the closest gold spans in a document.
+
+    First, calculates the distance between each predicted span and each gold span. Then,
+    for each predicted span, the gold span with the smallest distance is selected. If the
+    predicted span and the gold span have an overlap of at least half of the maximum length
+    of the two spans, the predicted span is aligned with the gold span.
+
+    This also works for MultiSpan annotations, where the slices of the MultiSpan are used
+    to align the predicted spans. If any of the slices is aligned with a gold slice,
+    the MultiSpan is aligned with the respective gold MultiSpan. However, this may result in
+    the predicted MultiSpan being aligned with multiple gold MultiSpans, in which case the
+    closest gold MultiSpan is selected. A simplified version of this alignment can be achieved
+    by setting `simple_multi_span=True`, which treats MultiSpan annotations as simple Spans
+    by using their maximum and minimum start and end indices.
+
+    Args:
+        document: The document to process.
+        span_layer: The name of the span layer.
+        distance_type: The type of distance to calculate. One of: center, inner, outer
+        simple_multi_span: Whether to treat MultiSpan annotations as simple Spans by using their
+            maximum and minimum start and end indices.
+        verbose: Whether to print debug information.
+
+    Returns:
+        The processed document.
+    """
+    gold_spans = document[span_layer]
+    if len(gold_spans) == 0:
+        return document.copy()
+
+    pred_spans = document[span_layer].predictions
+    span_annotation_type = document.annotation_types()[span_layer]
+    if issubclass(span_annotation_type, Span) or simple_multi_span:
+        old2new_pred_span, span_id2gold_span = _get_aligned_span_mappings(
+            gold_spans=gold_spans, pred_spans=pred_spans, distance_type=distance_type
+        )
+    elif issubclass(span_annotation_type, MultiSpan):
+        # create Span objects from MultiSpan slices
+        gold_single_spans2multi_spans = get_spans2multi_spans_mapping(gold_spans)
+        pred_single_spans2multi_spans = get_spans2multi_spans_mapping(pred_spans)
+        # create the alignment mappings for the single spans
+        single_old2new_pred_span, single_span_id2gold_span = _get_aligned_span_mappings(
+            gold_spans=gold_single_spans2multi_spans.keys(),
+            pred_spans=pred_single_spans2multi_spans.keys(),
+            distance_type=distance_type,
+        )
+        # collect all Spans that are part of the same MultiSpan
+        pred_multi_span2single_spans: Dict[MultiSpan, List[Span]] = defaultdict(list)
+        for pred_span, multi_span in pred_single_spans2multi_spans.items():
+            pred_multi_span2single_spans[multi_span].append(pred_span)
+
+        # create the new mappings for the MultiSpans
+        old2new_pred_span = {}
+        span_id2gold_span = {}
+        for pred_multi_span, pred_single_spans in pred_multi_span2single_spans.items():
+            # if any of the single spans is aligned with a gold span, align the multi span
+            if any(
+                pred_single_span._id in single_old2new_pred_span
+                for pred_single_span in pred_single_spans
+            ):
+                # get aligned gold multi spans
+                aligned_gold_multi_spans = set()
+                for pred_single_span in pred_single_spans:
+                    if pred_single_span._id in single_old2new_pred_span:
+                        aligned_gold_single_span = single_span_id2gold_span[pred_single_span._id]
+                        aligned_gold_multi_span = gold_single_spans2multi_spans[
+                            aligned_gold_single_span
+                        ]
+                        aligned_gold_multi_spans.add(aligned_gold_multi_span)
+
+                # calculate distances between the predicted multi span and the aligned gold multi spans
+                gold_multi_spans_with_distance = [
+                    (
+                        gold_multi_span,
+                        distance_slices(
+                            slices=pred_multi_span.slices,
+                            other_slices=gold_multi_span.slices,
+                            distance_type=distance_type,
+                        ),
+                    )
+                    for gold_multi_span in aligned_gold_multi_spans
+                ]
+
+                if len(aligned_gold_multi_spans) > 1:
+                    logger.warning(
+                        f"Multiple gold multi spans aligned with predicted multi span ({pred_multi_span}): "
+                        f"{aligned_gold_multi_spans}"
+                    )
+                # get the closest gold multi span
+                closest_gold_multi_span, min_distance = min(
+                    gold_multi_spans_with_distance, key=lambda x: x[1]
+                )
+                old2new_pred_span[pred_multi_span._id] = pred_multi_span.copy(
+                    slices=closest_gold_multi_span.slices
+                )
+                span_id2gold_span[pred_multi_span._id] = closest_gold_multi_span
+    else:
+        raise ValueError(f"Unsupported span annotation type: {span_annotation_type}")
+
     result = document.copy(with_annotations=False)
 
     # multiple predicted spans can be aligned with the same gold span,
@@ -356,3 +427,88 @@
     )
 
     return result
+
+
+def add_related_relations_from_binary_relations(
+    document: TextDocumentWithLabeledMultiSpansBinaryRelationsLabeledPartitionsAndRelatedRelations,
+    link_relation_label: str,
+    link_partition_whitelist: Optional[List[List[str]]] = None,
+    relation_label_whitelist: Optional[List[str]] = None,
+    reversed_relation_suffix: str = "_reversed",
+    symmetric_relations: Optional[List[str]] = None,
+) -> TextDocumentWithLabeledMultiSpansBinaryRelationsLabeledPartitionsAndRelatedRelations:
+    span2partition = {}
+    for multi_span in document.labeled_multi_spans:
+        found_partition = False
+        for partition in document.labeled_partitions or [
+            LabeledSpan(start=0, end=len(document.text), label="ALL")
+        ]:
+            starts, ends = zip(*multi_span.slices)
+            if partition.start <= min(starts) and max(ends) <= partition.end:
+                span2partition[multi_span] = partition
+                found_partition = True
+                break
+        if not found_partition:
+            raise ValueError(f"No partition found for multi_span {multi_span}")
+
+    rel_head2rels = defaultdict(list)
+    rel_tail2rels = defaultdict(list)
+    for rel in document.binary_relations:
+        rel_head2rels[rel.head].append(rel)
+        rel_tail2rels[rel.tail].append(rel)
+
+    link_partition_whitelist_tuples = None
+    if link_partition_whitelist is not None:
+        link_partition_whitelist_tuples = {tuple(pair) for pair in link_partition_whitelist}
+
+    skipped_labels = []
+    for link_rel in document.binary_relations:
+        if link_rel.label == link_relation_label:
+            head_partition = span2partition[link_rel.head]
+            tail_partition = span2partition[link_rel.tail]
+            if link_partition_whitelist_tuples is None or (
+                (head_partition.label, tail_partition.label) in link_partition_whitelist_tuples
+            ):
+                # link_head -> link_tail == rel_head -> rel_tail
+                for rel in rel_head2rels.get(link_rel.tail, []):
+                    label = rel.label
+                    if relation_label_whitelist is None or label in relation_label_whitelist:
+                        new_rel = RelatedRelation(
+                            head=link_rel.head,
+                            tail=rel.tail,
+                            link_relation=link_rel,
+                            relation=rel,
+                            label=label,
+                        )
+                        document.related_relations.append(new_rel)
+                    else:
+                        skipped_labels.append(label)
+
+                # link_head -> link_tail == rel_tail -> rel_head
+                if reversed_relation_suffix is not None:
+                    for reversed_rel in rel_tail2rels.get(link_rel.tail, []):
+                        label = reversed_rel.label
+                        if not (symmetric_relations is not None and label in symmetric_relations):
+                            label = f"{label}{reversed_relation_suffix}"
+                        if relation_label_whitelist is None or label in relation_label_whitelist:
+                            new_rel = RelatedRelation(
+                                head=link_rel.head,
+                                tail=reversed_rel.head,
+                                link_relation=link_rel,
+                                relation=reversed_rel,
+                                label=label,
+                            )
+                            document.related_relations.append(new_rel)
+                        else:
+                            skipped_labels.append(label)
+
+            else:
+                logger.warning(
+                    f"Skipping related relation because of partition whitelist ({[head_partition.label, tail_partition.label]}): {link_rel.resolve()}"
+                )
+    if len(skipped_labels) > 0:
+        logger.warning(
+            f"Skipped relations with labels not in whitelist: {sorted(set(skipped_labels))}"
+        )
+
+    return document
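For MultiSpan alignment, a multi span can also be collapsed to its bounding interval (simple_multi_span=True), after which the overlap criterion works exactly as for plain spans. A small standalone sketch of that bounding-interval check (helper names are illustrative, not from the module):

from typing import List, Tuple

def bounding_start_end(slices: List[Tuple[int, int]]) -> Tuple[int, int]:
    # same idea as get_start_end for MultiSpan: collapse all slices
    # into their overall (min start, max end) interval
    starts, ends = zip(*slices)
    return min(starts), max(ends)

def overlap_len(a: Tuple[int, int], b: Tuple[int, int]) -> int:
    return max(0, min(a[1], b[1]) - max(a[0], b[0]))

pred = bounding_start_end([(10, 15), (20, 30)])   # -> (10, 30)
gold = bounding_start_end([(12, 18), (25, 31)])   # -> (12, 31)
l_max = max(pred[1] - pred[0], gold[1] - gold[0])
# aligned only if the overlap covers at least half of the longer span
print(overlap_len(pred, gold) >= l_max / 2)       # True: overlap 18 >= 20 / 2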
src/document/types.py ADDED
@@ -0,0 +1,46 @@
+import dataclasses
+
+from pytorch_ie import AnnotationLayer, annotation_field
+from pytorch_ie.annotations import BinaryRelation
+from pytorch_ie.documents import (
+    TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+)
+
+
+@dataclasses.dataclass(eq=True, frozen=True)
+class RelatedRelation(BinaryRelation):
+    link_relation: BinaryRelation = dataclasses.field(default=None, compare=False)
+    relation: BinaryRelation = dataclasses.field(default=None, compare=False)
+
+    def __post_init__(self):
+        super().__post_init__()
+        # check if the reference_span is correct
+        self.reference_span
+
+    @property
+    def reference_span(self):
+        if self.link_relation is None:
+            raise ValueError(
+                "No semantically_same_relation available, cannot return reference_span"
+            )
+        if self.link_relation.head == self.head:
+            return self.link_relation.tail
+        elif self.link_relation.tail == self.head:
+            return self.link_relation.head
+        elif self.link_relation.head == self.tail:
+            return self.link_relation.tail
+        elif self.link_relation.tail == self.tail:
+            return self.link_relation.head
+        else:
+            raise ValueError(
+                "The semantically_same_relation is neither linked to head nor tail of the current relation"
+            )
+
+
+@dataclasses.dataclass
+class TextDocumentWithLabeledMultiSpansBinaryRelationsLabeledPartitionsAndRelatedRelations(
+    TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+):
+    related_relations: AnnotationLayer[RelatedRelation] = annotation_field(
+        targets=["labeled_multi_spans", "binary_relations"]
+    )
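A minimal usage sketch for RelatedRelation.reference_span, assuming the repository root is on PYTHONPATH; the span offsets and labels below are made up for illustration:

from pytorch_ie.annotations import BinaryRelation, LabeledSpan

from src.document.types import RelatedRelation

# hypothetical spans: a query ADU, the span it is linked to, and a related ADU
query_span = LabeledSpan(start=0, end=5, label="claim")
ref_span = LabeledSpan(start=100, end=110, label="claim")
other_span = LabeledSpan(start=200, end=210, label="data")

link_rel = BinaryRelation(head=query_span, tail=ref_span, label="semantically_same")
base_rel = BinaryRelation(head=ref_span, tail=other_span, label="supports")

related = RelatedRelation(
    head=query_span,
    tail=other_span,
    link_relation=link_rel,
    relation=base_rel,
    label="supports",
)
# the reference span is the other end of the link relation, seen from the head
assert related.reference_span == ref_span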
src/metrics/__init__.py CHANGED
@@ -1,2 +1,3 @@
 from .coref_sklearn import CorefMetricsSKLearn
 from .coref_torchmetrics import CorefMetricsTorchmetrics
+from .score_distribution import ScoreDistribution
src/metrics/score_distribution.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ import pandas as pd
5
+ from pytorch_ie import Document, DocumentMetric
6
+
7
+
8
+ class ScoreDistribution(DocumentMetric):
9
+ """Computes the distribution of prediction scores for annotations in a layer. The scores are
10
+ separated into true positives (TP) and false positives (FP) based on the gold annotations.
11
+
12
+ Args:
13
+ layer: The name of the annotation layer to analyze.
14
+ per_label: If True, the scores are separated per label. Default is False.
15
+ label_field: The field name of the label to use for separating the scores per label. Default is "label".
16
+ equal_sample_size_binning: If True, the scores are binned into equal sample sizes. If False,
17
+ the scores are binned into equal width. The former is useful when the distribution of scores is skewed.
18
+ Default is True.
19
+ show_plot: If True, a plot of the score distribution is shown. Default is False.
20
+ plotting_backend: The plotting backend to use. Default is "plotly".
21
+ plotting_caption_mapping: A mapping to rename any caption entries for plotting, i.e., the layer name,
22
+ labels, or TP/FP. Default is None.
23
+ plotting_colors: A dictionary mapping from gold scores to colors for plotting. Default is None.
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ layer: str,
29
+ label_field: str = "label",
30
+ per_label: bool = False,
31
+ show_plot: bool = False,
32
+ equal_sample_size_binning: bool = True,
33
+ plotting_backend: str = "plotly",
34
+ plotting_caption_mapping: Optional[Dict[str, str]] = None,
35
+ plotting_colors: Optional[Dict[str, str]] = None,
36
+ plotly_use_create_distplot: bool = True,
37
+ plotly_barmode: Optional[str] = None,
38
+ plotly_marginal: Optional[str] = "violin",
39
+ plotly_font_size: int = 18,
40
+ plotly_font_family: Optional[str] = None,
41
+ plotly_background_color: Optional[str] = None,
42
+ ):
43
+ super().__init__()
44
+ self.layer = layer
45
+ self.label_field = label_field
46
+ self.per_label = per_label
47
+ self.equal_sample_size_binning = equal_sample_size_binning
48
+ self.plotting_backend = plotting_backend
49
+ self.show_plot = show_plot
50
+ self.plotting_caption_mapping = plotting_caption_mapping or {}
51
+ self.plotting_colors = plotting_colors
52
+ self.plotly_use_create_distplot = plotly_use_create_distplot
53
+ self.plotly_barmode = plotly_barmode
54
+ self.plotly_marginal = plotly_marginal
55
+ self.plotly_font_size = plotly_font_size
56
+ self.plotly_font_family = plotly_font_family
57
+ self.plotly_background_color = plotly_background_color
58
+ self.scores: Dict[str, Dict[str, List[float]]] = defaultdict(lambda: defaultdict(list))
59
+
60
+ def reset(self):
61
+ self.scores = defaultdict(lambda: defaultdict(list))
62
+
63
+ def _update(self, document: Document):
64
+
65
+ gold_annotations = set(document[self.layer])
66
+ for ann in document[self.layer].predictions:
67
+ if self.per_label:
68
+ label = getattr(ann, self.label_field)
69
+ else:
70
+ label = "ALL"
71
+ if ann in gold_annotations:
72
+ self.scores[label]["TP"].append(ann.score)
73
+ else:
74
+ self.scores[label]["FP"].append(ann.score)
75
+
76
+ def _combine_scores(
77
+ self,
78
+ scores_tp: List[float],
79
+ score_fp: List[float],
80
+ col_name_pred: str = "prediction",
81
+ col_name_gold: str = "gold",
82
+ ) -> pd.DataFrame:
83
+ scores_tp_df = pd.DataFrame(scores_tp, columns=[col_name_pred])
84
+ scores_tp_df[col_name_gold] = 1.0
85
+ scores_fp_df = pd.DataFrame(score_fp, columns=[col_name_pred])
86
+ scores_fp_df[col_name_gold] = 0.0
87
+ scores_df = pd.concat([scores_tp_df, scores_fp_df])
88
+ return scores_df
89
+
90
+ def _get_calibration_data_and_metrics(
91
+ self, scores: pd.DataFrame, q: int = 20
92
+ ) -> Tuple[pd.DataFrame, pd.Series]:
93
+ from sklearn.metrics import brier_score_loss
94
+
95
+ if self.equal_sample_size_binning:
96
+ # Create bins with equal number of samples.
97
+ scores["bin"] = pd.qcut(scores["prediction"], q=q, labels=False)
98
+ else:
99
+ # Create bins with equal width.
100
+ scores["bin"] = pd.cut(
101
+ scores["prediction"],
102
+ bins=q,
103
+ include_lowest=True,
104
+ right=True,
105
+ labels=False,
106
+ )
107
+
108
+ calibration_data = (
109
+ scores.groupby("bin")
110
+ .apply(
111
+ lambda x: pd.Series(
112
+ {
113
+ "avg_score": x["prediction"].mean(),
114
+ "fraction_positive": x["gold"].mean(),
115
+ "count": len(x),
116
+ }
117
+ )
118
+ )
119
+ .reset_index()
120
+ )
121
+
122
+ total_count = scores.shape[0]
123
+ calibration_data["bin_weight"] = calibration_data["count"] / total_count
124
+
125
+ # Calculate the absolute differences and squared differences.
126
+ calibration_data["abs_diff"] = abs(
127
+ calibration_data["avg_score"] - calibration_data["fraction_positive"]
128
+ )
129
+ calibration_data["squared_diff"] = (
130
+ calibration_data["avg_score"] - calibration_data["fraction_positive"]
131
+ ) ** 2
132
+
133
+ # Compute Expected Calibration Error (ECE): weighted average of absolute differences.
134
+ ece = (calibration_data["abs_diff"] * calibration_data["bin_weight"]).sum()
135
+
136
+ # Compute Maximum Calibration Error (MCE): maximum absolute difference.
137
+ mce = calibration_data["abs_diff"].max()
138
+
139
+ # Compute Mean Squared Error (MSE): weighted average of squared differences.
140
+ mse = (calibration_data["squared_diff"] * calibration_data["bin_weight"]).sum()
141
+
142
+ # Compute the Brier Score on the raw predictions.
143
+ brier = brier_score_loss(scores["gold"], scores["prediction"])
144
+
145
+ values = {
146
+ "ece": ece,
147
+ "mce": mce,
148
+ "mse": mse,
149
+ "brier": brier,
150
+ }
151
+ return calibration_data, pd.Series(values)
152
+
153
+ def calculate_calibration_metrics(self, scores_combined: pd.DataFrame) -> pd.DataFrame:
154
+
155
+ calibration_data_dict = {}
156
+ calibration_metrics_dict = {}
157
+ for label, current_scores in scores_combined.groupby("label"):
158
+ calibration_data, calibration_metrics = self._get_calibration_data_and_metrics(
159
+ current_scores, q=20
160
+ )
161
+ calibration_data_dict[label] = calibration_data
162
+ calibration_metrics_dict[label] = calibration_metrics
163
+ all_calibration_data = pd.concat(
164
+ calibration_data_dict, names=["label", "idx"]
165
+ ).reset_index(level=0)
166
+ all_calibration_metrics = pd.concat(calibration_metrics_dict, axis=1).T
167
+
168
+ if self.show_plot:
169
+ self.plot_calibration_data(calibration_data=all_calibration_data)
170
+
171
+ return all_calibration_metrics
172
+
173
+ def calculate_correlation(self, scores: pd.DataFrame) -> pd.Series:
174
+ result_dict = {}
175
+ for label, current_scores in scores.groupby("label"):
176
+ result_dict[label] = current_scores.drop("label", axis=1).corr()["prediction"]["gold"]
177
+
178
+ return pd.Series(result_dict, name="correlation")
179
+
180
+ @property
181
+ def mapped_layer(self):
182
+ return self.plotting_caption_mapping.get(self.layer, self.layer)
183
+
184
+ def plot_score_distribution(self, scores: pd.DataFrame):
185
+ if self.plotting_backend == "plotly":
186
+ for label in scores["label"].unique():
187
+ description = f"Distribution of Predicted Scores for {self.mapped_layer}"
188
+ if self.per_label:
189
+ label_mapped = self.plotting_caption_mapping.get(label, label)
190
+ description += f" ({label_mapped})"
191
+ if self.plotly_use_create_distplot:
192
+ import plotly.figure_factory as ff
193
+
194
+ current_scores = scores[scores["label"] == label]
195
+ # group by gold score
196
+ scores_dict = (
197
+ current_scores.groupby("gold")["prediction"].apply(list).to_dict()
198
+ )
199
+ group_labels, hist_data = zip(*scores_dict.items())
200
+ group_labels_renamed = [
201
+ self.plotting_caption_mapping.get(label, label) for label in group_labels
202
+ ]
203
+ if self.plotting_colors is not None:
204
+ colors = [
205
+ self.plotting_colors[group_label] for group_label in group_labels
206
+ ]
207
+ else:
208
+ colors = None
209
+ fig = ff.create_distplot(
210
+ hist_data,
211
+ group_labels=group_labels_renamed,
212
+ show_hist=True,
213
+ colors=colors,
214
+ bin_size=0.025,
215
+ )
216
+ else:
217
+ import plotly.express as px
218
+
219
+ fig = px.histogram(
220
+ scores,
221
+ x="prediction",
222
+ color="gold",
223
+ marginal=self.plotly_marginal, # "violin", # or box, violin, rug
224
+ hover_data=scores.columns,
225
+ color_discrete_map=self.plotting_colors,
226
+ nbins=50,
227
+ )
228
+
229
+ fig.update_layout(
230
+ height=600,
231
+ width=800,
232
+ title_text=description,
233
+ title_x=0.5,
234
+ font=dict(size=self.plotly_font_size),
235
+ legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
236
+ )
237
+ if self.plotly_barmode is not None:
238
+ fig.update_layout(barmode=self.plotly_barmode)
239
+ if self.plotly_font_family is not None:
240
+ fig.update_layout(font_family=self.plotly_font_family)
241
+ if self.plotly_background_color is not None:
242
+ fig.update_layout(
243
+ plot_bgcolor=self.plotly_background_color,
244
+ paper_bgcolor=self.plotly_background_color,
245
+ )
246
+
247
+ fig.show()
248
+ else:
249
+ raise NotImplementedError(f"Plotting backend {self.plotting_backend} not implemented")
250
+
251
+ def plot_calibration_data(self, calibration_data: pd.DataFrame):
252
+ import plotly.express as px
253
+ import plotly.graph_objects as go
254
+
255
+ color = "label" if self.per_label else None
256
+ x_col = "avg_score"
257
+ y_col = "fraction_positive"
258
+ fig = px.scatter(
259
+ calibration_data,
260
+ x=x_col,
261
+ y=y_col,
262
+ color=color,
263
+ trendline="ols",
264
+ labels=self.plotting_caption_mapping,
265
+ )
266
+ if not self.per_label:
267
+ fig["data"][1]["name"] = "prediction vs. gold"
268
+
269
+ # show legend only for trendlines
270
+ for idx, trace_data in enumerate(fig["data"]):
271
+ if idx % 2 == 0:
272
+ trace_data["showlegend"] = False
273
+ else:
274
+ trace_data["showlegend"] = True
275
+
276
+ # add the optimal line
277
+ minimum = calibration_data[x_col].min()
278
+ maximum = calibration_data[x_col].max()
279
+ fig.add_trace(
280
+ go.Scatter(
281
+ x=[minimum, maximum],
282
+ y=[minimum, maximum],
283
+ mode="lines",
284
+ name="optimal",
285
+ line=dict(color="black", dash="dash"),
286
+ )
287
+ )
288
+ fig.update_layout(
289
+ height=600,
290
+ width=800,
291
+ title_text=f"Mean Binned Scores for {self.mapped_layer}",
292
+ title_x=0.5,
293
+ font=dict(size=self.plotly_font_size),
294
+ )
295
+ fig.update_layout(
296
+ legend=dict(
297
+ yanchor="top",
298
+ y=0.99,
299
+ xanchor="left",
300
+ x=0.01,
301
+ title="OLS trendline" + ("s" if self.per_label else ""),
302
+ ),
303
+ )
304
+ if self.plotly_background_color is not None:
305
+ fig.update_layout(
306
+ plot_bgcolor=self.plotly_background_color,
307
+ paper_bgcolor=self.plotly_background_color,
308
+ )
309
+
310
+ if self.plotly_font_family is not None:
311
+ fig.update_layout(font_family=self.plotly_font_family)
312
+
313
+ fig.show()
314
+
315
+ def _compute(self) -> Dict[str, Dict[str, Any]]:
316
+ scores_combined = pd.concat(
317
+ {
318
+ label: self._combine_scores(scores["TP"], scores["FP"])
319
+ for label, scores in self.scores.items()
320
+ },
321
+ names=["label", "idx"],
322
+ ).reset_index(level=0)
323
+
324
+ result_df = scores_combined.groupby("label")["prediction"].agg(["mean", "std", "count"])
325
+ if self.show_plot:
326
+ self.plot_score_distribution(scores=scores_combined)
327
+
328
+ calibration_metrics = self.calculate_calibration_metrics(scores_combined)
329
+ calibration_metrics["correlation"] = self.calculate_correlation(scores_combined)
330
+
331
+ result_df = pd.concat(
332
+ {"prediction": result_df, "prediction vs. gold": calibration_metrics}, axis=1
333
+ )
334
+
335
+ if not self.per_label:
336
+ result = result_df.xs("ALL")
337
+ else:
338
+ result = result_df.T.stack().unstack()
339
+
340
+ result_dict = {
341
+ main_key: result.xs(main_key).T.to_dict()
342
+ for main_key in result.index.get_level_values(0).unique()
343
+ }
344
+
345
+ return result_dict
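For orientation, a minimal self-contained sketch of the binned calibration metrics assembled above. The toy numbers and bin layout are made up; the assumption is that abs_diff/squared_diff hold the per-bin gap between the mean predicted score and the observed positive rate, weighted by bin_weight.

import pandas as pd

# toy binned calibration data: mean predicted score, observed positive rate, bin weight
calibration_data = pd.DataFrame(
    {
        "avg_score": [0.1, 0.5, 0.9],
        "fraction_positive": [0.15, 0.45, 0.85],
        "bin_weight": [0.2, 0.5, 0.3],
    }
)
gap = calibration_data["avg_score"] - calibration_data["fraction_positive"]
calibration_data["abs_diff"] = gap.abs()
calibration_data["squared_diff"] = gap**2

ece = (calibration_data["abs_diff"] * calibration_data["bin_weight"]).sum()  # weighted mean gap
mce = calibration_data["abs_diff"].max()  # worst-case gap over bins
mse = (calibration_data["squared_diff"] * calibration_data["bin_weight"]).sum()
print(ece, mce, mse)  # 0.05 0.05 0.0025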
src/models/__init__.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from .sequence_classification_with_pooler import (
2
  SequencePairSimilarityModelWithMaxCosineSim,
3
  SequencePairSimilarityModelWithPooler2,
 
1
+ from .sequence_classification import SimpleSequenceClassificationModelWithInputTypeIds
2
  from .sequence_classification_with_pooler import (
3
  SequencePairSimilarityModelWithMaxCosineSim,
4
  SequencePairSimilarityModelWithPooler2,
src/models/sequence_classification.py ADDED
@@ -0,0 +1,94 @@
1
+ from typing import Optional
2
+
3
+ from pie_modules.models import SimpleSequenceClassificationModel
4
+ from pie_modules.models.simple_sequence_classification import InputType, OutputType, TargetType
5
+ from pytorch_ie import PyTorchIEModel
6
+ from torch import nn
7
+ from transformers import BertModel
8
+ from transformers.utils import is_accelerate_available
9
+
10
+ if is_accelerate_available():
11
+ from accelerate.hooks import add_hook_to_module
12
+
13
+
14
+ @PyTorchIEModel.register()
15
+ class SimpleSequenceClassificationModelWithInputTypeIds(SimpleSequenceClassificationModel):
16
+
17
+ def __init__(
18
+ self, num_token_type_ids: int, use_as_token_type_ids: str = "token_type_ids", **kwargs
19
+ ):
20
+ super().__init__(**kwargs)
21
+ self.num_token_type_ids = num_token_type_ids
22
+ self.token_type_ids_key = use_as_token_type_ids
23
+ self.resize_type_embeddings(num_token_type_ids)
24
+
25
+ def get_input_type_embeddings(self) -> nn.Module:
26
+ base_model: BertModel = getattr(self.model, self.model.base_model_prefix)
27
+ if base_model is None:
28
+ raise ValueError("Model has no base model.")
29
+ return base_model.embeddings.token_type_embeddings
30
+
31
+ def set_input_type_embeddings(self, value):
32
+ base_model: BertModel = getattr(self.model, self.model.base_model_prefix)
33
+ if base_model is None:
34
+ raise ValueError("Model has no base model.")
35
+ base_model.embeddings.token_type_embeddings = value
36
+
37
+ def _resize_type_embeddings(self, new_num_tokens, pad_to_multiple_of=None):
38
+ old_embeddings = self.get_input_type_embeddings()
39
+ new_embeddings = self.model._get_resized_embeddings(
40
+ old_embeddings, new_num_tokens, pad_to_multiple_of
41
+ )
42
+ if hasattr(old_embeddings, "_hf_hook"):
43
+ hook = old_embeddings._hf_hook
44
+ add_hook_to_module(new_embeddings, hook)
45
+ old_embeddings_requires_grad = old_embeddings.weight.requires_grad
46
+ new_embeddings.requires_grad_(old_embeddings_requires_grad)
47
+ self.set_input_type_embeddings(new_embeddings)
48
+
49
+ return self.get_input_type_embeddings()
50
+
51
+ def resize_type_embeddings(
52
+ self, new_num_types: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
53
+ ) -> nn.Embedding:
54
+ """
55
+ Same as resize_token_embeddings but for the token type embeddings.
56
+
57
+ Resizes input token type embeddings matrix of the model if `new_num_types != config.type_vocab_size`.
58
+
59
+ Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
60
+
61
+ Arguments:
62
+ new_num_types (`int`, *optional*):
63
+ The number of new token types in the embedding matrix. Increasing the size will add newly initialized
64
+ vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
65
+ returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
66
+ pad_to_multiple_of (`int`, *optional*):
67
+ If set will pad the embedding matrix to a multiple of the provided value.If `new_num_tokens` is set to
68
+ `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.
69
+
70
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
71
+ `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
72
+ details about this, or help on choosing the correct value for resizing, refer to this guide:
73
+ https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
74
+
75
+ Return:
76
+ `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
77
+ """
78
+ model_embeds = self._resize_type_embeddings(new_num_types, pad_to_multiple_of)
79
+ if new_num_types is None and pad_to_multiple_of is None:
80
+ return model_embeds
81
+
82
+ # Update base model and current model config
83
+ self.model.config.type_vocab_size = model_embeds.weight.shape[0]
84
+
85
+ # Tie weights again if needed
86
+ self.model.tie_weights()
87
+
88
+ return model_embeds
89
+
90
+ def forward(self, inputs: InputType, targets: Optional[TargetType] = None) -> OutputType:
91
+ kwargs = {**inputs, **(targets or {})}
92
+ # rename the configured key to token_type_ids
93
+ kwargs["token_type_ids"] = kwargs.pop(self.token_type_ids_key)
94
+ return self.model(**kwargs)
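As a rough standalone illustration of what the type-embedding resize above amounts to (plain PyTorch, not the transformers helper used by the model): the pretrained rows are kept and the extra rows start freshly initialized. The embedding sizes below are hypothetical.

import torch
from torch import nn

old = nn.Embedding(2, 8)   # BERT-style default with type_vocab_size=2
new = nn.Embedding(6, 8)   # room for BIO-derived type ids plus padding (hypothetical size)
with torch.no_grad():
    new.weight[:2] = old.weight  # copy the pretrained rows; rows 2..5 keep their fresh init
print(new.weight.shape)  # torch.Size([6, 8])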
src/pipeline/ner_re_pipeline.py CHANGED
@@ -15,6 +15,7 @@ from typing import (
15
  overload,
16
  )
17
 
 
18
  from pie_modules.utils import resolve_type
19
  from pytorch_ie import AutoPipeline, WithDocumentTypeMixin
20
  from pytorch_ie.core import Document
@@ -53,31 +54,105 @@ def move_annotations_to_predictions(doc: D, layer_names: List[str]) -> None:
53
  doc[layer_name].predictions.extend(annotations)
54
 
55
 
56
  def add_annotations_from_other_documents(
57
  docs: Iterable[D],
58
  other_docs: Sequence[Document],
59
- layer_names: List[str],
60
- from_predictions: bool = False,
61
- to_predictions: bool = False,
62
- clear_before: bool = True,
63
- ) -> None:
64
- for i, doc in enumerate(docs):
65
- other_doc = other_docs[i]
66
- # copy to not modify the input
67
- other_doc = type(other_doc).fromdict(other_doc.asdict())
68
-
69
- for layer_name in layer_names:
70
- if clear_before:
71
- doc[layer_name].clear()
72
- other_layer = other_doc[layer_name]
73
- if from_predictions:
74
- other_layer = other_layer.predictions
75
- other_annotations = list(other_layer)
76
- other_layer.clear()
77
- if to_predictions:
78
- doc[layer_name].predictions.extend(other_annotations)
 
 
79
  else:
80
- doc[layer_name].extend(other_annotations)
81
 
82
 
83
  def process_pipeline_steps(
@@ -227,6 +302,9 @@ class NerRePipeline:
227
  "re_add_gold_data": partial(
228
  add_annotations_from_other_documents,
229
  other_docs=original_docs,
 
 
 
230
  layer_names=[self.entity_layer, self.relation_layer],
231
  **self.processor_kwargs.get("re_add_gold_data", {}),
232
  ),
 
15
  overload,
16
  )
17
 
18
+ from pie_datasets import Dataset
19
  from pie_modules.utils import resolve_type
20
  from pytorch_ie import AutoPipeline, WithDocumentTypeMixin
21
  from pytorch_ie.core import Document
 
54
  doc[layer_name].predictions.extend(annotations)
55
 
56
 
57
+ def _add_annotations_from_other_document(
58
+ doc: D,
59
+ from_predictions: bool,
60
+ to_predictions: bool,
61
+ clear_before: bool,
62
+ other_doc: Optional[D] = None,
63
+ other_docs_dict: Optional[Dict[str, D]] = None,
64
+ layer_names: Optional[List[str]] = None,
65
+ ) -> D:
66
+ if other_doc is None:
67
+ if other_docs_dict is None:
68
+ raise ValueError("Either other_doc or other_docs_dict must be provided")
69
+ other_doc = other_docs_dict.get(doc.id)
70
+ if other_doc is None:
71
+ logger.warning(f"Document with ID {doc.id} not found in other_docs")
72
+ return doc
73
+
74
+ # copy to not modify the input
75
+ other_doc_copy = type(other_doc).fromdict(other_doc.asdict())
76
+
77
+ if layer_names is None:
78
+ layer_names = [field.name for field in doc.annotation_fields()]
79
+
80
+ for layer_name in layer_names:
81
+ layer = doc[layer_name]
82
+ if to_predictions:
83
+ layer = layer.predictions
84
+ if clear_before:
85
+ layer.clear()
86
+ other_layer = other_doc_copy[layer_name]
87
+ if from_predictions:
88
+ other_layer = other_layer.predictions
89
+ other_annotations = list(other_layer)
90
+ other_layer.clear()
91
+ layer.extend(other_annotations)
92
+
93
+ return doc
94
+
95
+
96
  def add_annotations_from_other_documents(
97
  docs: Iterable[D],
98
  other_docs: Sequence[Document],
99
+ get_other_doc_by_id: bool = False,
100
+ **kwargs,
101
+ ) -> Sequence[D]:
102
+ other_id2doc = None
103
+ if get_other_doc_by_id:
104
+ other_id2doc = {doc.id: doc for doc in other_docs}
105
+
106
+ if isinstance(docs, Dataset):
107
+ if other_id2doc is None:
108
+ raise ValueError("get_other_doc_by_id must be True when passing a Dataset")
109
+ result = docs.map(
110
+ _add_annotations_from_other_document,
111
+ fn_kwargs=dict(other_docs_dict=other_id2doc, **kwargs),
112
+ )
113
+ elif isinstance(docs, list):
114
+ result = []
115
+ for i, doc in enumerate(docs):
116
+ if other_id2doc is not None:
117
+ other_doc = other_id2doc.get(doc.id)
118
+ if other_doc is None:
119
+ logger.warning(f"Document with ID {doc.id} not found in other_docs")
120
+ continue
121
  else:
122
+ other_doc = other_docs[i]
123
+
124
+ # check if the IDs of the documents match
125
+ doc_id = getattr(doc, "id", None)
126
+ other_doc_id = getattr(other_doc, "id", None)
127
+ if doc_id is not None and doc_id != other_doc_id:
128
+ raise ValueError(
129
+ f"IDs of the documents do not match: {doc_id} != {other_doc_id}"
130
+ )
131
+
132
+ current_result = _add_annotations_from_other_document(
133
+ doc, other_doc=other_doc, **kwargs
134
+ )
135
+ result.append(current_result)
136
+ else:
137
+ raise ValueError(f"Unsupported type: {type(docs)}")
138
+
139
+ return result
140
+
141
+
142
+ DM = TypeVar("DM", bound=Dict[str, Iterable[Document]])
143
+
144
+
145
+ def add_annotations_from_other_documents_dict(
146
+ docs: DM, other_docs: Dict[str, Sequence[Document]], **kwargs
147
+ ) -> DM:
148
+ if set(docs.keys()) != set(other_docs.keys()):
149
+ raise ValueError("Keys of the documents do not match")
150
+
151
+ result_dict = {
152
+ key: add_annotations_from_other_documents(doc_list, other_docs[key], **kwargs)
153
+ for key, doc_list in docs.items()
154
+ }
155
+ return type(docs)(result_dict)
156
 
157
 
158
  def process_pipeline_steps(
 
302
  "re_add_gold_data": partial(
303
  add_annotations_from_other_documents,
304
  other_docs=original_docs,
305
+ from_predictions=False,
306
+ to_predictions=False,
307
+ clear_before=False,
308
  layer_names=[self.entity_layer, self.relation_layer],
309
  **self.processor_kwargs.get("re_add_gold_data", {}),
310
  ),
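A toy sketch of the ID-based alignment enabled by get_other_doc_by_id, with hypothetical stand-in documents that only carry an id: documents are matched by ID instead of by position, so reordered collections still line up.

from dataclasses import dataclass

@dataclass
class Doc:
    id: str

docs = [Doc("a"), Doc("b")]
other_docs = [Doc("b"), Doc("a")]  # same documents, different order
other_id2doc = {doc.id: doc for doc in other_docs}
for doc in docs:
    other_doc = other_id2doc.get(doc.id)
    # positional pairing would mismatch here; ID lookup keeps the pairs consistent
    assert other_doc is not None and other_doc.id == doc.id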
src/predict.py CHANGED
@@ -34,14 +34,13 @@ root = pyrootutils.setup_root(
34
  # ------------------------------------------------------------------------------------ #
35
 
36
  import os
37
- import timeit
38
  from collections.abc import Iterable, Sequence
39
  from typing import Any, Dict, Optional, Tuple, Union
40
 
41
  import hydra
42
  import pytorch_lightning as pl
43
  from omegaconf import DictConfig, OmegaConf
44
- from pie_datasets import Dataset, DatasetDict
45
  from pie_modules.models import * # noqa: F403
46
  from pie_modules.taskmodules import * # noqa: F403
47
  from pytorch_ie import Document, Pipeline
@@ -132,38 +131,13 @@ def predict(cfg: DictConfig) -> Tuple[dict, dict]:
132
  "pipeline": pipeline,
133
  "serializer": serializer,
134
  }
135
- result: Dict[str, Any] = {}
136
- if pipeline is not None:
137
- log.info("Starting inference!")
138
- prediction_time = 0.0
139
- else:
140
- log.warning("No prediction pipeline is defined, skip inference!")
141
- prediction_time = None
142
- document_batch_size = cfg.get("document_batch_size", None)
143
- for docs_batch in (
144
- document_batch_iter(dataset_predict, document_batch_size)
145
- if document_batch_size
146
- else [dataset_predict]
147
- ):
148
- if pipeline is not None:
149
- t_start = timeit.default_timer()
150
- docs_batch = pipeline(docs_batch, inplace=False)
151
- prediction_time += timeit.default_timer() - t_start # type: ignore
152
-
153
- # serialize the documents
154
- if serializer is not None:
155
- # the serializer should not return the serialized documents, but write them to disk
156
- # and instead return some metadata such as the path to the serialized documents
157
- serializer_result = serializer(docs_batch)
158
- if "serializer" in result and result["serializer"] != serializer_result:
159
- log.warning(
160
- f"serializer result changed from {result['serializer']} to {serializer_result}"
161
- " during prediction. Only the last result is returned."
162
- )
163
- result["serializer"] = serializer_result
164
-
165
- if prediction_time is not None:
166
- result["prediction_time"] = prediction_time
167
 
168
  # serialize config with resolved paths
169
  if cfg.get("config_out_path"):
 
34
  # ------------------------------------------------------------------------------------ #
35
 
36
  import os
 
37
  from collections.abc import Iterable, Sequence
38
  from typing import Any, Dict, Optional, Tuple, Union
39
 
40
  import hydra
41
  import pytorch_lightning as pl
42
  from omegaconf import DictConfig, OmegaConf
43
+ from pie_datasets import DatasetDict
44
  from pie_modules.models import * # noqa: F403
45
  from pie_modules.taskmodules import * # noqa: F403
46
  from pytorch_ie import Document, Pipeline
 
131
  "pipeline": pipeline,
132
  "serializer": serializer,
133
  }
134
+ # predict and serialize
135
+ result: Dict[str, Any] = utils.predict_and_serialize(
136
+ pipeline=pipeline,
137
+ serializer=serializer,
138
+ dataset=dataset_predict,
139
+ document_batch_size=cfg.get("document_batch_size", None),
140
+ )
141
 
142
  # serialize config with resolved paths
143
  if cfg.get("config_out_path"):
src/serializer/interface.py CHANGED
@@ -1,5 +1,5 @@
1
  from abc import ABC, abstractmethod
2
- from typing import Any, Sequence
3
 
4
  from pytorch_ie.core import Document
5
 
@@ -12,5 +12,5 @@ class DocumentSerializer(ABC):
12
  """
13
 
14
  @abstractmethod
15
- def __call__(self, documents: Sequence[Document]) -> Any:
16
  pass
 
1
  from abc import ABC, abstractmethod
2
+ from typing import Any, Iterable
3
 
4
  from pytorch_ie.core import Document
5
 
 
12
  """
13
 
14
  @abstractmethod
15
+ def __call__(self, documents: Iterable[Document]) -> Any:
16
  pass
src/serializer/json.py CHANGED
@@ -1,6 +1,6 @@
1
  import json
2
  import os
3
- from typing import Dict, List, Optional, Sequence, Type, TypeVar
4
 
5
  from pie_datasets import Dataset, DatasetDict, IterableDataset
6
  from pie_datasets.core.dataset_dict import METADATA_FILE_NAME
@@ -8,7 +8,7 @@ from pytorch_ie.core import Document
8
  from pytorch_ie.utils.hydra import resolve_optional_document_type, serialize_document_type
9
 
10
  from src.serializer.interface import DocumentSerializer
11
- from src.utils import get_pylogger
12
 
13
  log = get_pylogger(__name__)
14
 
@@ -31,7 +31,7 @@ class JsonSerializer(DocumentSerializer):
31
  @classmethod
32
  def write(
33
  cls,
34
- documents: Sequence[Document],
35
  path: str,
36
  file_name: str = "documents.jsonl",
37
  metadata_file_name: str = METADATA_FILE_NAME,
@@ -42,6 +42,9 @@ class JsonSerializer(DocumentSerializer):
42
  log.info(f'serialize documents to "{realpath}" ...')
43
  os.makedirs(realpath, exist_ok=True)
44
 
 
 
 
45
  # dump metadata including the document_type
46
  if len(documents) == 0:
47
  raise Exception("cannot serialize empty list of documents")
@@ -130,7 +133,7 @@ class JsonSerializer(DocumentSerializer):
130
  all_kwargs = {**self.default_kwargs, **kwargs}
131
  return self.write(**all_kwargs)
132
 
133
- def __call__(self, documents: Sequence[Document], **kwargs) -> Dict[str, str]:
134
  return self.write_with_defaults(documents=documents, **kwargs)
135
 
136
 
@@ -141,12 +144,15 @@ class JsonSerializer2(DocumentSerializer):
141
  @classmethod
142
  def write(
143
  cls,
144
- documents: Sequence[Document],
145
  path: str,
146
  split: str = "train",
147
  ) -> Dict[str, str]:
148
  if not isinstance(documents, (Dataset, IterableDataset)):
149
- documents = Dataset.from_documents(documents)
 
 
 
150
  dataset_dict = DatasetDict({split: documents})
151
  dataset_dict.to_json(path=path)
152
  return {"path": path, "split": split}
@@ -175,5 +181,5 @@ class JsonSerializer2(DocumentSerializer):
175
  all_kwargs = {**self.default_kwargs, **kwargs}
176
  return self.write(**all_kwargs)
177
 
178
- def __call__(self, documents: Sequence[Document], **kwargs) -> Dict[str, str]:
179
  return self.write_with_defaults(documents=documents, **kwargs)
 
1
  import json
2
  import os
3
+ from typing import Dict, Iterable, List, Optional, Sequence, Type, TypeVar
4
 
5
  from pie_datasets import Dataset, DatasetDict, IterableDataset
6
  from pie_datasets.core.dataset_dict import METADATA_FILE_NAME
 
8
  from pytorch_ie.utils.hydra import resolve_optional_document_type, serialize_document_type
9
 
10
  from src.serializer.interface import DocumentSerializer
11
+ from src.utils.logging_utils import get_pylogger
12
 
13
  log = get_pylogger(__name__)
14
 
 
31
  @classmethod
32
  def write(
33
  cls,
34
+ documents: Iterable[Document],
35
  path: str,
36
  file_name: str = "documents.jsonl",
37
  metadata_file_name: str = METADATA_FILE_NAME,
 
42
  log.info(f'serialize documents to "{realpath}" ...')
43
  os.makedirs(realpath, exist_ok=True)
44
 
45
+ if not isinstance(documents, Sequence):
46
+ documents = list(documents)
47
+
48
  # dump metadata including the document_type
49
  if len(documents) == 0:
50
  raise Exception("cannot serialize empty list of documents")
 
133
  all_kwargs = {**self.default_kwargs, **kwargs}
134
  return self.write(**all_kwargs)
135
 
136
+ def __call__(self, documents: Iterable[Document], **kwargs) -> Dict[str, str]:
137
  return self.write_with_defaults(documents=documents, **kwargs)
138
 
139
 
 
144
  @classmethod
145
  def write(
146
  cls,
147
+ documents: Iterable[Document],
148
  path: str,
149
  split: str = "train",
150
  ) -> Dict[str, str]:
151
  if not isinstance(documents, (Dataset, IterableDataset)):
152
+ if not isinstance(documents, Sequence):
153
+ documents = IterableDataset.from_documents(documents)
154
+ else:
155
+ documents = Dataset.from_documents(documents)
156
  dataset_dict = DatasetDict({split: documents})
157
  dataset_dict.to_json(path=path)
158
  return {"path": path, "split": split}
 
181
  all_kwargs = {**self.default_kwargs, **kwargs}
182
  return self.write(**all_kwargs)
183
 
184
+ def __call__(self, documents: Iterable[Document], **kwargs) -> Dict[str, str]:
185
  return self.write_with_defaults(documents=documents, **kwargs)
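The Sequence check above is what lets the serializers accept generators as well as lists; a minimal sketch of that materialization step, detached from the PIE dataset classes:

from collections.abc import Sequence

def ensure_sequence(documents):
    # generators and other one-shot iterables are collected into a list,
    # sequences (e.g. lists) pass through unchanged
    if not isinstance(documents, Sequence):
        return list(documents)
    return documents

print(ensure_sequence(doc for doc in ["doc1", "doc2"]))  # ['doc1', 'doc2']
print(ensure_sequence(["doc1", "doc2"]))                 # ['doc1', 'doc2']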
src/start_demo.py CHANGED
@@ -99,6 +99,7 @@ def main(cfg: DictConfig) -> None:
99
  render_caption2mode = {v: k for k, v in render_mode2caption.items()}
100
  default_min_similarity = cfg["default_min_similarity"]
101
  default_top_k = cfg["default_top_k"]
 
102
  layer_caption_mapping = cfg["layer_caption_mapping"]
103
  relation_name_mapping = cfg["relation_name_mapping"]
104
 
@@ -287,6 +288,13 @@ def main(cfg: DictConfig) -> None:
287
  step=1,
288
  value=default_top_k,
289
  )
 
 
 
 
 
 
 
290
  retrieve_similar_adus_btn = gr.Button(
291
  "Retrieve *similar* ADUs for *selected* ADU"
292
  )
@@ -361,18 +369,23 @@ def main(cfg: DictConfig) -> None:
361
  load_pie_dataset_btn = gr.Button("Load & Embed PIE Dataset")
362
 
363
  render_event_kwargs = dict(
364
- fn=lambda _retriever, _document_id, _render_as, _render_kwargs, _all_relevant_adus_df, _all_relevant_adus_query_doc_id: render_annotated_document(
365
- retriever=_retriever[0],
366
- document_id=_document_id,
367
- render_with=render_caption2mode[_render_as],
368
- render_kwargs_json=_render_kwargs,
369
- highlight_span_ids=(
370
- _all_relevant_adus_df["query_span_id"].tolist()
371
- if _document_id == _all_relevant_adus_query_doc_id
372
- else None
373
- ),
 
 
 
 
374
  ),
375
  inputs=[
 
376
  retriever_state,
377
  selected_document_id,
378
  render_as,
@@ -583,10 +596,11 @@ def main(cfg: DictConfig) -> None:
583
  ).success(**show_stats_kwargs)
584
 
585
  retrieve_relevant_adus_event_kwargs = dict(
586
- fn=lambda _retriever, _selected_adu_id, _min_similarity, _top_k: retrieve_relevant_spans(
587
  retriever=_retriever[0],
588
  query_span_id=_selected_adu_id,
589
  k=_top_k,
 
590
  score_threshold=_min_similarity,
591
  relation_label_mapping=relation_name_mapping,
592
  # columns=relevant_adus.headers
@@ -596,6 +610,7 @@ def main(cfg: DictConfig) -> None:
596
  selected_adu_id,
597
  min_similarity,
598
  top_k,
 
599
  ],
600
  outputs=[relevant_adus_df],
601
  )
@@ -614,10 +629,11 @@ def main(cfg: DictConfig) -> None:
614
  ).success(**retrieve_relevant_adus_event_kwargs)
615
 
616
  retrieve_similar_adus_btn.click(
617
- fn=lambda _retriever, _selected_adu_id, _min_similarity, _tok_k: retrieve_similar_spans(
618
  retriever=_retriever[0],
619
  query_span_id=_selected_adu_id,
620
  k=_tok_k,
 
621
  score_threshold=_min_similarity,
622
  ),
623
  inputs=[
@@ -625,6 +641,7 @@ def main(cfg: DictConfig) -> None:
625
  selected_adu_id,
626
  min_similarity,
627
  top_k,
 
628
  ],
629
  outputs=[similar_adus_df],
630
  )
@@ -635,10 +652,11 @@ def main(cfg: DictConfig) -> None:
635
  )
636
 
637
  retrieve_all_similar_adus_btn.click(
638
- fn=lambda _retriever, _document_id, _min_similarity, _tok_k: retrieve_all_similar_spans(
639
  retriever=_retriever[0],
640
  query_doc_id=_document_id,
641
  k=_tok_k,
 
642
  score_threshold=_min_similarity,
643
  query_span_id_column="query_span_id",
644
  ),
@@ -647,16 +665,18 @@ def main(cfg: DictConfig) -> None:
647
  selected_document_id,
648
  min_similarity,
649
  top_k,
 
650
  ],
651
  outputs=[all_similar_adus_df],
652
  )
653
 
654
  retrieve_all_relevant_adus_btn.click(
655
- fn=lambda _retriever, _document_id, _min_similarity, _tok_k: (
656
  retrieve_all_relevant_spans(
657
  retriever=_retriever[0],
658
  query_doc_id=_document_id,
659
  k=_tok_k,
 
660
  score_threshold=_min_similarity,
661
  query_span_id_column="query_span_id",
662
  query_span_text_column="query_span_text",
@@ -668,6 +688,7 @@ def main(cfg: DictConfig) -> None:
668
  selected_document_id,
669
  min_similarity,
670
  top_k,
 
671
  ],
672
  outputs=[all_relevant_adus_df, all_relevant_adus_query_doc_id],
673
  )
 
99
  render_caption2mode = {v: k for k, v in render_mode2caption.items()}
100
  default_min_similarity = cfg["default_min_similarity"]
101
  default_top_k = cfg["default_top_k"]
102
+ default_min_score = cfg["default_min_score"]
103
  layer_caption_mapping = cfg["layer_caption_mapping"]
104
  relation_name_mapping = cfg["relation_name_mapping"]
105
 
 
288
  step=1,
289
  value=default_top_k,
290
  )
291
+ min_score = gr.Slider(
292
+ label="Minimum Score",
293
+ minimum=0.0,
294
+ maximum=1.0,
295
+ step=0.01,
296
+ value=default_min_score,
297
+ )
298
  retrieve_similar_adus_btn = gr.Button(
299
  "Retrieve *similar* ADUs for *selected* ADU"
300
  )
 
369
  load_pie_dataset_btn = gr.Button("Load & Embed PIE Dataset")
370
 
371
  render_event_kwargs = dict(
372
+ fn=lambda _rendered_output, _retriever, _document_id, _render_as, _render_kwargs, _all_relevant_adus_df, _all_relevant_adus_query_doc_id: (
373
+ render_annotated_document(
374
+ retriever=_retriever[0],
375
+ document_id=_document_id,
376
+ render_with=render_caption2mode[_render_as],
377
+ render_kwargs_json=_render_kwargs,
378
+ highlight_span_ids=(
379
+ _all_relevant_adus_df["query_span_id"].tolist()
380
+ if _document_id == _all_relevant_adus_query_doc_id
381
+ else None
382
+ ),
383
+ )
384
+ if _document_id.strip() != ""
385
+ else _rendered_output
386
  ),
387
  inputs=[
388
+ rendered_output,
389
  retriever_state,
390
  selected_document_id,
391
  render_as,
 
596
  ).success(**show_stats_kwargs)
597
 
598
  retrieve_relevant_adus_event_kwargs = dict(
599
+ fn=lambda _retriever, _selected_adu_id, _min_similarity, _top_k, _min_score: retrieve_relevant_spans(
600
  retriever=_retriever[0],
601
  query_span_id=_selected_adu_id,
602
  k=_top_k,
603
+ min_score=_min_score,
604
  score_threshold=_min_similarity,
605
  relation_label_mapping=relation_name_mapping,
606
  # columns=relevant_adus.headers
 
610
  selected_adu_id,
611
  min_similarity,
612
  top_k,
613
+ min_score,
614
  ],
615
  outputs=[relevant_adus_df],
616
  )
 
629
  ).success(**retrieve_relevant_adus_event_kwargs)
630
 
631
  retrieve_similar_adus_btn.click(
632
+ fn=lambda _retriever, _selected_adu_id, _min_similarity, _tok_k, _min_score: retrieve_similar_spans(
633
  retriever=_retriever[0],
634
  query_span_id=_selected_adu_id,
635
  k=_tok_k,
636
+ min_score=_min_score,
637
  score_threshold=_min_similarity,
638
  ),
639
  inputs=[
 
641
  selected_adu_id,
642
  min_similarity,
643
  top_k,
644
+ min_score,
645
  ],
646
  outputs=[similar_adus_df],
647
  )
 
652
  )
653
 
654
  retrieve_all_similar_adus_btn.click(
655
+ fn=lambda _retriever, _document_id, _min_similarity, _tok_k, _min_score: retrieve_all_similar_spans(
656
  retriever=_retriever[0],
657
  query_doc_id=_document_id,
658
  k=_tok_k,
659
+ min_score=_min_score,
660
  score_threshold=_min_similarity,
661
  query_span_id_column="query_span_id",
662
  ),
 
665
  selected_document_id,
666
  min_similarity,
667
  top_k,
668
+ min_score,
669
  ],
670
  outputs=[all_similar_adus_df],
671
  )
672
 
673
  retrieve_all_relevant_adus_btn.click(
674
+ fn=lambda _retriever, _document_id, _min_similarity, _tok_k, _min_score: (
675
  retrieve_all_relevant_spans(
676
  retriever=_retriever[0],
677
  query_doc_id=_document_id,
678
  k=_tok_k,
679
+ min_score=_min_score,
680
  score_threshold=_min_similarity,
681
  query_span_id_column="query_span_id",
682
  query_span_text_column="query_span_text",
 
688
  selected_document_id,
689
  min_similarity,
690
  top_k,
691
+ min_score,
692
  ],
693
  outputs=[all_relevant_adus_df, all_relevant_adus_query_doc_id],
694
  )
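The render callback above falls back to the previously rendered output when no document is selected; a small stand-in sketch of that guard (the render function here is hypothetical, not render_annotated_document itself):

def render_or_keep(rendered_output: str, document_id: str) -> str:
    # keep the previous rendering when the document id is empty or whitespace
    if document_id.strip() == "":
        return rendered_output
    return f"<rendered {document_id}>"  # placeholder for the actual rendering call

print(render_or_keep("<previous>", "   "))    # <previous>
print(render_or_keep("<previous>", "doc-1"))  # <rendered doc-1>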
src/taskmodules/cross_text_binary_coref_nli.py CHANGED
@@ -62,6 +62,9 @@ class CrossTextBinaryCorefTaskModuleByNli(RelationStatisticsMixin, TaskModuleTyp
62
  tokenizer_name_or_path: str,
63
  labels: List[str],
64
  entailment_label: str,
 
 
 
65
  **kwargs,
66
  ) -> None:
67
  super().__init__(**kwargs)
@@ -69,6 +72,9 @@ class CrossTextBinaryCorefTaskModuleByNli(RelationStatisticsMixin, TaskModuleTyp
69
 
70
  self.labels = labels
71
  self.entailment_label = entailment_label
 
 
 
72
  self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
73
 
74
  def _post_prepare(self):
@@ -118,9 +124,18 @@ class CrossTextBinaryCorefTaskModuleByNli(RelationStatisticsMixin, TaskModuleTyp
118
  for task_encoding in task_encodings:
119
  all_texts.extend(task_encoding.inputs["text"])
120
  all_texts_pair.extend(task_encoding.inputs["text_pair"])
 
 
 
 
 
 
 
 
 
121
  inputs = self.tokenizer(
122
- text=all_texts,
123
- text_pair=all_texts_pair,
124
  truncation=True,
125
  padding=True,
126
  return_tensors="pt",
@@ -159,8 +174,20 @@ class CrossTextBinaryCorefTaskModuleByNli(RelationStatisticsMixin, TaskModuleTyp
159
  task_encoding: TaskEncoding[DocumentType, InputEncodingType, TargetEncodingType],
160
  task_output: TaskOutputType,
161
  ) -> Iterator[Tuple[str, Annotation]]:
162
- if all(label == self.entailment_label for label in task_output["label_pair"]):
 
 
 
163
  probs = task_output["entailment_probability_pair"]
164
- score = (probs[0] + probs[1]) / 2
 
 
 
 
 
 
 
 
 
165
  new_coref_rel = task_encoding.metadata["candidate_annotation"].copy(score=score)
166
  yield "binary_coref_relations", new_coref_rel
 
62
  tokenizer_name_or_path: str,
63
  labels: List[str],
64
  entailment_label: str,
65
+ combine_score_method: str = "average",
66
+ keep_all_relations: bool = False,
67
+ as_text_pair: bool = True,
68
  **kwargs,
69
  ) -> None:
70
  super().__init__(**kwargs)
 
72
 
73
  self.labels = labels
74
  self.entailment_label = entailment_label
75
+ self.combine_score_method = combine_score_method
76
+ self.keep_all_relations = keep_all_relations
77
+ self.as_text_pair = as_text_pair
78
  self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
79
 
80
  def _post_prepare(self):
 
124
  for task_encoding in task_encodings:
125
  all_texts.extend(task_encoding.inputs["text"])
126
  all_texts_pair.extend(task_encoding.inputs["text_pair"])
127
+ if self.as_text_pair:
128
+ text = all_texts
129
+ text_pair = all_texts_pair
130
+ else:
131
+ text = [
132
+ f"{text}{self.tokenizer.sep_token}{text_pair}"
133
+ for text, text_pair in zip(all_texts, all_texts_pair)
134
+ ]
135
+ text_pair = None
136
  inputs = self.tokenizer(
137
+ text=text,
138
+ text_pair=text_pair,
139
  truncation=True,
140
  padding=True,
141
  return_tensors="pt",
 
174
  task_encoding: TaskEncoding[DocumentType, InputEncodingType, TargetEncodingType],
175
  task_output: TaskOutputType,
176
  ) -> Iterator[Tuple[str, Annotation]]:
177
+ if (
178
+ all(label == self.entailment_label for label in task_output["label_pair"])
179
+ or self.keep_all_relations
180
+ ):
181
  probs = task_output["entailment_probability_pair"]
182
+ if self.combine_score_method == "average":
183
+ score = (probs[0] + probs[1]) / 2
184
+ elif self.combine_score_method == "min":
185
+ score = min(probs)
186
+ elif self.combine_score_method == "max":
187
+ score = max(probs)
188
+ elif self.combine_score_method == "product":
189
+ score = probs[0] * probs[1]
190
+ else:
191
+ raise ValueError(f"Unsupported combine_score_method: {self.combine_score_method}")
192
  new_coref_rel = task_encoding.metadata["candidate_annotation"].copy(score=score)
193
  yield "binary_coref_relations", new_coref_rel
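The new combine_score_method options reduce the two directional entailment probabilities to a single relation score; a standalone sketch of that combination logic:

def combine_scores(probs, method: str = "average") -> float:
    # probs holds the entailment probability for each direction of the candidate pair
    if method == "average":
        return (probs[0] + probs[1]) / 2
    if method == "min":
        return min(probs)
    if method == "max":
        return max(probs)
    if method == "product":
        return probs[0] * probs[1]
    raise ValueError(f"Unsupported combine_score_method: {method}")

print(round(combine_scores((0.9, 0.7), "average"), 3))  # 0.8
print(round(combine_scores((0.9, 0.7), "product"), 3))  # 0.63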
src/train.py CHANGED
@@ -38,13 +38,14 @@ from typing import Any, Dict, List, Optional, Tuple
38
 
39
  import hydra
40
  import pytorch_lightning as pl
41
- from omegaconf import DictConfig
42
  from pie_datasets import DatasetDict
43
  from pie_modules.models import * # noqa: F403
44
  from pie_modules.models import SimpleGenerativeModel
45
  from pie_modules.models.interface import RequiresTaskmoduleConfig
46
  from pie_modules.taskmodules import * # noqa: F403
47
  from pie_modules.taskmodules import PointerNetworkTaskModuleForEnd2EndRE
 
48
  from pytorch_ie.core import PyTorchIEModel, TaskModule
49
  from pytorch_ie.models import * # noqa: F403
50
  from pytorch_ie.models.interface import RequiresModelNameOrPath, RequiresNumClasses
@@ -56,6 +57,7 @@ from pytorch_lightning.loggers import Logger
56
  from src import utils
57
  from src.datamodules import PieDataModule
58
  from src.models import * # noqa: F403
 
59
  from src.taskmodules import * # noqa: F403
60
 
61
  log = utils.get_pylogger(__name__)
@@ -81,6 +83,27 @@ def get_metric_value(metric_dict: dict, metric_name: str) -> Optional[float]:
81
  return metric_value
82
 
83
 
84
  @utils.task_wrapper
85
  def train(cfg: DictConfig) -> Tuple[dict, dict]:
86
  """Trains the model. Can additionally evaluate on a testset, using best weights obtained during
@@ -179,6 +202,11 @@ def train(cfg: DictConfig) -> Tuple[dict, dict]:
179
  )
180
  additional_model_kwargs["base_model_config"] = base_model_config
181
 
 
 
 
 
 
182
  # initialize the model
183
  model: PyTorchIEModel = hydra.utils.instantiate(
184
  cfg.model, _convert_="partial", **additional_model_kwargs
@@ -207,9 +235,11 @@ def train(cfg: DictConfig) -> Tuple[dict, dict]:
207
  log.info("Logging hyperparameters!")
208
  utils.log_hyperparameters(logger=logger, model=model, taskmodule=taskmodule, config=cfg)
209
 
210
- if cfg.model_save_dir is not None:
211
- log.info(f"Save taskmodule to {cfg.model_save_dir} [push_to_hub={cfg.push_to_hub}]")
212
- taskmodule.save_pretrained(save_directory=cfg.model_save_dir, push_to_hub=cfg.push_to_hub)
 
 
213
  else:
214
  log.warning("the taskmodule is not saved because no save_dir is specified")
215
 
@@ -238,15 +268,17 @@ def train(cfg: DictConfig) -> Tuple[dict, dict]:
238
  f"Expected format: " + '"epoch_{best_epoch}.ckpt"'
239
  )
240
 
241
- if not cfg.trainer.get("fast_dev_run"):
242
- if cfg.model_save_dir is not None:
243
  if best_ckpt_path == "":
244
  log.warning("Best ckpt not found! Using current weights for saving...")
245
  else:
246
  model = type(model).load_from_checkpoint(best_ckpt_path)
247
 
248
- log.info(f"Save model to {cfg.model_save_dir} [push_to_hub={cfg.push_to_hub}]")
249
- model.save_pretrained(save_directory=cfg.model_save_dir, push_to_hub=cfg.push_to_hub)
 
 
250
  else:
251
  log.warning("the model is not saved because no save_dir is specified")
252
 
@@ -275,8 +307,36 @@ def train(cfg: DictConfig) -> Tuple[dict, dict]:
275
 
276
  # add model_save_dir to the result so that it gets dumped to job_return_value.json
277
  # if we use hydra_callbacks.SaveJobReturnValueCallback
278
- if cfg.get("model_save_dir") is not None:
279
- metric_dict["model_save_dir"] = cfg.model_save_dir
280
 
281
  return metric_dict, object_dict
282
 
@@ -301,4 +361,5 @@ def main(cfg: DictConfig) -> Optional[float]:
301
  if __name__ == "__main__":
302
  utils.replace_sys_args_with_values_from_files()
303
  utils.prepare_omegaconf()
 
304
  main()
 
38
 
39
  import hydra
40
  import pytorch_lightning as pl
41
+ from omegaconf import DictConfig, OmegaConf
42
  from pie_datasets import DatasetDict
43
  from pie_modules.models import * # noqa: F403
44
  from pie_modules.models import SimpleGenerativeModel
45
  from pie_modules.models.interface import RequiresTaskmoduleConfig
46
  from pie_modules.taskmodules import * # noqa: F403
47
  from pie_modules.taskmodules import PointerNetworkTaskModuleForEnd2EndRE
48
+ from pytorch_ie import Pipeline
49
  from pytorch_ie.core import PyTorchIEModel, TaskModule
50
  from pytorch_ie.models import * # noqa: F403
51
  from pytorch_ie.models.interface import RequiresModelNameOrPath, RequiresNumClasses
 
57
  from src import utils
58
  from src.datamodules import PieDataModule
59
  from src.models import * # noqa: F403
60
+ from src.serializer.interface import DocumentSerializer
61
  from src.taskmodules import * # noqa: F403
62
 
63
  log = utils.get_pylogger(__name__)
 
83
  return metric_value
84
 
85
 
86
+ def flatten_nested_dict(d: Dict[str, Any], parent_key: str = "", sep: str = ".") -> Dict[str, Any]:
87
+ """Flatten a nested dictionary.
88
+
89
+ Args:
90
+ d (Dict[str, Any]): The dictionary to flatten.
91
+ parent_key (str): The parent key.
92
+ sep (str): The separator.
93
+
94
+ Returns:
95
+ Dict[str, Any]: The flattened dictionary.
96
+ """
97
+ items: List[Tuple[str, Any]] = []
98
+ for k, v in d.items():
99
+ new_key = f"{parent_key}{sep}{k}" if parent_key else k
100
+ if isinstance(v, dict):
101
+ items.extend(flatten_nested_dict(v, new_key, sep=sep).items())
102
+ else:
103
+ items.append((new_key, v))
104
+ return dict(items)
105
+
106
+
107
  @utils.task_wrapper
108
  def train(cfg: DictConfig) -> Tuple[dict, dict]:
109
  """Trains the model. Can additionally evaluate on a testset, using best weights obtained during
 
202
  )
203
  additional_model_kwargs["base_model_config"] = base_model_config
204
 
205
+ if issubclass(model_cls, SimpleSequenceClassificationModelWithInputTypeIds): # noqa: F405
206
+ # add the number of input type ids to the model:
207
+ # 2 for B- and I-labels for each entity type, 1 for O labels, 1 for padding
208
+ additional_model_kwargs["num_token_type_ids"] = len(taskmodule.entity_labels) * 2 + 1 + 1
209
+
210
  # initialize the model
211
  model: PyTorchIEModel = hydra.utils.instantiate(
212
  cfg.model, _convert_="partial", **additional_model_kwargs
 
235
  log.info("Logging hyperparameters!")
236
  utils.log_hyperparameters(logger=logger, model=model, taskmodule=taskmodule, config=cfg)
237
 
238
+ if cfg.paths.model_save_dir is not None:
239
+ log.info(f"Save taskmodule to {cfg.paths.model_save_dir} [push_to_hub={cfg.push_to_hub}]")
240
+ taskmodule.save_pretrained(
241
+ save_directory=cfg.paths.model_save_dir, push_to_hub=cfg.push_to_hub
242
+ )
243
  else:
244
  log.warning("the taskmodule is not saved because no save_dir is specified")
245
 
 
268
  f"Expected format: " + '"epoch_{best_epoch}.ckpt"'
269
  )
270
 
271
+ if not cfg.trainer.get("fast_dev_run") or cfg.get("predict", False):
272
+ if cfg.paths.model_save_dir is not None:
273
  if best_ckpt_path == "":
274
  log.warning("Best ckpt not found! Using current weights for saving...")
275
  else:
276
  model = type(model).load_from_checkpoint(best_ckpt_path)
277
 
278
+ log.info(f"Save model to {cfg.paths.model_save_dir} [push_to_hub={cfg.push_to_hub}]")
279
+ model.save_pretrained(
280
+ save_directory=cfg.paths.model_save_dir, push_to_hub=cfg.push_to_hub
281
+ )
282
  else:
283
  log.warning("the model is not saved because no save_dir is specified")
284
 
 
307
 
308
  # add model_save_dir to the result so that it gets dumped to job_return_value.json
309
  # if we use hydra_callbacks.SaveJobReturnValueCallback
310
+ if cfg.paths.get("model_save_dir") is not None:
311
+ metric_dict["model_save_dir"] = cfg.paths.model_save_dir
312
+
313
+ if cfg.get("predict"):
314
+ # Init the inference pipeline
315
+ pipeline: Optional[Pipeline] = None
316
+ if cfg.get("pipeline") and cfg.pipeline.get("_target_"):
317
+ log.info(f"Instantiating inference pipeline <{cfg.pipeline._target_}>")
318
+ pipeline = hydra.utils.instantiate(cfg.pipeline, _convert_="partial")
319
+ # Init the serializer
320
+ serializer: Optional[DocumentSerializer] = None
321
+ if cfg.get("serializer") and cfg.serializer.get("_target_"):
322
+ log.info(f"Instantiating serializer <{cfg.serializer._target_}>")
323
+ serializer = hydra.utils.instantiate(cfg.serializer, _convert_="partial")
324
+ # predict and serialize
325
+ predict_metrics: Dict[str, Any] = utils.predict_and_serialize(
326
+ pipeline=pipeline,
327
+ serializer=serializer,
328
+ dataset=dataset[cfg.dataset_split],
329
+ document_batch_size=cfg.get("document_batch_size", None),
330
+ )
331
+ # flatten the predict_metrics dict
332
+ predict_metrics_flat = flatten_nested_dict(predict_metrics, sep="/")
333
+ metric_dict.update(predict_metrics_flat)
334
+
335
+ if cfg.get("delete_model_dir"):
336
+ import shutil
337
+
338
+ log.info(f"Deleting model directory {cfg.paths.model_save_dir}")
339
+ shutil.rmtree(cfg.paths.model_save_dir)
340
 
341
  return metric_dict, object_dict
342
 
 
361
  if __name__ == "__main__":
362
  utils.replace_sys_args_with_values_from_files()
363
  utils.prepare_omegaconf()
364
+ OmegaConf.register_new_resolver("eval", eval)
365
  main()
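Flattening the prediction metrics turns nested job return values into scalar keys that are easy to log; a self-contained sketch using the same recursion as flatten_nested_dict above (repeated here so the snippet runs standalone):

from typing import Any, Dict

def flatten(d: Dict[str, Any], parent_key: str = "", sep: str = "/") -> Dict[str, Any]:
    # same recursion as flatten_nested_dict, with "/" as the separator
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

print(flatten({"serializer": {"path": "out/preds", "split": "test"}, "prediction_time": 1.2}))
# {'serializer/path': 'out/preds', 'serializer/split': 'test', 'prediction_time': 1.2}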
src/utils/__init__.py CHANGED
@@ -5,7 +5,8 @@ from .config_utils import (
5
  prepare_omegaconf,
6
  )
7
  from .data_utils import download_and_unzip, filter_dataframe_and_get_column
 
8
  from .logging_utils import close_loggers, get_pylogger, log_hyperparameters
9
  from .rich_utils import enforce_tags, print_config_tree
10
- from .span_utils import distance
11
  from .task_utils import extras, replace_sys_args_with_values_from_files, save_file, task_wrapper
 
5
  prepare_omegaconf,
6
  )
7
  from .data_utils import download_and_unzip, filter_dataframe_and_get_column
8
+ from .inference_utils import predict_and_serialize
9
  from .logging_utils import close_loggers, get_pylogger, log_hyperparameters
10
  from .rich_utils import enforce_tags, print_config_tree
11
+ from .span_utils import distance, distance_slices
12
  from .task_utils import extras, replace_sys_args_with_values_from_files, save_file, task_wrapper
src/utils/inference_utils.py ADDED
@@ -0,0 +1,74 @@
1
+ import timeit
2
+ from collections.abc import Iterable, Sequence
3
+ from typing import Any, Dict, Optional, Union
4
+
5
+ from pytorch_ie import Document, Pipeline
6
+
7
+ from src.serializer.interface import DocumentSerializer
8
+
9
+ from .logging_utils import get_pylogger
10
+
11
+ log = get_pylogger(__name__)
12
+
13
+
14
+ def document_batch_iter(
15
+ dataset: Iterable[Document], batch_size: int
16
+ ) -> Iterable[Sequence[Document]]:
17
+ if isinstance(dataset, Sequence):
18
+ for i in range(0, len(dataset), batch_size):
19
+ yield dataset[i : i + batch_size]
20
+ elif isinstance(dataset, Iterable):
21
+ docs = []
22
+ for doc in dataset:
23
+ docs.append(doc)
24
+ if len(docs) == batch_size:
25
+ yield docs
26
+ docs = []
27
+ if docs:
28
+ yield docs
29
+ else:
30
+ raise ValueError(f"Unsupported dataset type: {type(dataset)}")
31
+
32
+
33
+ def predict_and_serialize(
34
+ pipeline: Optional[Pipeline],
35
+ serializer: Optional[DocumentSerializer],
36
+ dataset: Iterable[Document],
37
+ document_batch_size: Optional[int] = None,
38
+ ) -> Dict[str, Any]:
39
+ result: Dict[str, Any] = {}
40
+ if pipeline is not None:
41
+ log.info("Starting inference!")
42
+ prediction_time = 0.0
43
+ else:
44
+ log.warning("No prediction pipeline is defined, skip inference!")
45
+ prediction_time = None
46
+ docs_batch: Union[Iterable[Document], Sequence[Document]]
47
+
48
+ batch_iter: Union[Sequence[Iterable[Document]], Iterable[Sequence[Document]]]
49
+ if document_batch_size is None:
50
+ batch_iter = [dataset]
51
+ else:
52
+ batch_iter = document_batch_iter(dataset=dataset, batch_size=document_batch_size)
53
+ for docs_batch in batch_iter:
54
+ if pipeline is not None:
55
+ t_start = timeit.default_timer()
56
+ docs_batch = pipeline(docs_batch, inplace=False)
57
+ prediction_time += timeit.default_timer() - t_start # type: ignore
58
+
59
+ # serialize the documents
60
+ if serializer is not None:
61
+ # the serializer should not return the serialized documents, but write them to disk
62
+ # and instead return some metadata such as the path to the serialized documents
63
+ serializer_result = serializer(docs_batch)
64
+ if "serializer" in result and result["serializer"] != serializer_result:
65
+ log.warning(
66
+ f"serializer result changed from {result['serializer']} to {serializer_result}"
67
+ " during prediction. Only the last result is returned."
68
+ )
69
+ result["serializer"] = serializer_result
70
+
71
+ if prediction_time is not None:
72
+ result["prediction_time"] = prediction_time
73
+
74
+ return result
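The batching helper above yields fixed-size chunks from either a sequence or a one-shot iterable; a tiny sketch of the sequence path with plain integers standing in for documents:

def batch_iter(items, batch_size):
    # slice in steps of batch_size; the final chunk may be smaller
    for i in range(0, len(items), batch_size):
        yield items[i : i + batch_size]

print(list(batch_iter([1, 2, 3, 4, 5], 2)))  # [[1, 2], [3, 4], [5]]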
src/utils/span_utils.py CHANGED
@@ -58,3 +58,17 @@ def distance(
58
  raise ValueError(
59
  f"unknown distance_type={distance_type}. use one of: center, inner, outer"
60
  )
58
  raise ValueError(
59
  f"unknown distance_type={distance_type}. use one of: center, inner, outer"
60
  )
61
+
62
+
63
+ def distance_slices(
64
+ slices: Tuple[Tuple[int, int], ...],
65
+ other_slices: Tuple[Tuple[int, int], ...],
66
+ distance_type: str,
67
+ ) -> float:
68
+ starts, ends = zip(*slices)
69
+ other_starts, other_ends = zip(*other_slices)
70
+ return distance(
71
+ start_end=(min(starts), max(ends)),
72
+ other_start_end=(min(other_starts), max(other_ends)),
73
+ distance_type=distance_type,
74
+ )
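distance_slices reduces each multi-slice span to its overall extent before delegating to distance; a short sketch of that reduction step (the toy slices are made up):

slices = ((0, 5), (10, 15))
other_slices = ((20, 25),)
starts, ends = zip(*slices)
other_starts, other_ends = zip(*other_slices)
# each span collapses to (min start, max end) before the distance is computed
print((min(starts), max(ends)))              # (0, 15)
print((min(other_starts), max(other_ends)))  # (20, 25)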