Skip to content

Bag-of-Entity metrics

ie_eval.metrics.bag_of_entities

Compute the bag-of-words/bag-of-tagged-words/bag-of-entities metrics from a label/prediction dataset.

Attributes

logger module-attribute

logger = getLogger(__name__)

Classes

WordType

Bases: Enum

Word Type.

Attributes
word class-attribute instance-attribute
word = 'word'
tagged_word class-attribute instance-attribute
tagged_word = 'tagged_word'
entity class-attribute instance-attribute
entity = 'entity'

Functions

prepare

prepare(
    document: Document,
    attr_name: str,
    with_category: bool = False,
)

Get list of words, tagged words or entities (overall and by category).

Parameters:

Name Type Description Default
document Document

Processed document

required
attr_name str

Name of the attribute which holds the objects to store

required
with_category bool

Store the category along the word. Defaults to False.

False
Source code in ie_eval/metrics/bag_of_entities.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def prepare(document: Document, attr_name: str, with_category: bool = False):
    """Group the (category, text) pairs of a document by category and overall.

    Args:
        document (Document): Processed document
        attr_name (str): Name of the document attribute to iterate over; it must yield (category, text) pairs
        with_category (bool, optional): Store ``(category, text)`` tuples instead of the bare text. Defaults to False.

    Returns:
        A ``defaultdict(list)`` mapping each category, as well as ``GLOBAL_STAT_NAME``, to the list of stored items.
    """
    buckets = defaultdict(list)
    pairs = getattr(document, attr_name)
    for category, text in pairs:
        if with_category:
            entry = (category, text)
        else:
            entry = text
        # Every item is filed twice: under its own category and under the overall key.
        for key in (category, GLOBAL_STAT_NAME):
            buckets[key].append(entry)
    return buckets

tokenize_entities

tokenize_entities(
    label: Document,
    prediction: Document,
    word_type: WordType,
) -> tuple[
    dict[str, list[str | tuple[str, str]]],
    dict[str, list[str | tuple[str, str]]],
]

Prepare label and prediction for BagOfWords computation.

Parameters:

Name Type Description Default
label Document

the label document

required
prediction Document

the prediction document

required
word_type WordType

Whether to consider a list of words, list of tagged words, or list of tagged entities.

required

Returns:

Type Description
dict[str, list[str | tuple[str, str]]]

a label dictionary with categories as keys and corresponding list of words, tagged words or tagged entities as values.

dict[str, list[str | tuple[str, str]]]

a prediction dictionary with categories as keys and corresponding list of words, tagged words or tagged entities as values.

Source code in ie_eval/metrics/bag_of_entities.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def tokenize_entities(
    label: Document,
    prediction: Document,
    word_type: WordType,
) -> tuple[
    dict[str, list[str | tuple[str, str]]],
    dict[str, list[str | tuple[str, str]]],
]:
    """Build the per-category item lists of a label/prediction pair.

    Both documents are run through ``prepare`` with the same options, chosen from ``word_type``.

    Args:
        label (Document): the label document
        prediction (Document): the prediction document
        word_type (WordType): Whether to consider a list of words, list of tagged words, or list of tagged entities.

    Returns:
        a label dictionary with categories as keys and corresponding list of words, tagged words or tagged entities as values.
        a prediction dictionary with categories as keys and corresponding list of words, tagged words or tagged entities as values.
    """
    if word_type == WordType.word:
        # Plain words, without their category
        options = {"attr_name": "word_entities"}
    elif word_type == WordType.tagged_word:
        # Words stored together with their category
        options = {"attr_name": "word_entities", "with_category": True}
    elif word_type == WordType.entity:
        # Whole entities stored together with their category
        options = {"attr_name": "entities", "with_category": True}
    else:
        # Unknown type: no options are set, so ``prepare`` is called without ``attr_name``.
        options = {}

    prepared_label = prepare(document=label, **options)
    prepared_prediction = prepare(document=prediction, **options)
    return (prepared_label, prepared_prediction)

compute_bag_of_anything

compute_bag_of_anything(
    dataset: list[tuple[Document, Document]],
    by_category: bool = False,
    word_type: WordType = WordType.word,
    print_table: bool = True,
) -> PrettyTable

Compute bag-of-words, bag-of-tagged-words, or bag-of-entities.

Parameters:

Name Type Description Default
dataset list[tuple[Document, Document]]

a dataset containing a list of tuples with the label and corresponding prediction.

required
word_type WordType

Type of words to use for bag-of-words computation.

word
by_category bool

Whether to compute the metric globally or for each category. Defaults to False.

False
print_table bool

Whether to print the table. Defaults to True.

True

Returns:

Name Type Description
PrettyTable PrettyTable

The evaluation table formatted in Markdown.

Source code in ie_eval/metrics/bag_of_entities.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def compute_bag_of_anything(
    dataset: list[tuple[Document, Document]],
    by_category: bool = False,
    word_type: WordType = WordType.word,
    print_table: bool = True,
) -> PrettyTable:
    """Compute bag-of-words, bag-of-tagged-words, or bag-of-entities.

    For each (label, prediction) pair a ``BagOfWords`` score is built from the overall
    item lists and accumulated under ``GLOBAL_STAT_NAME``. When ``by_category`` is set,
    one additional score is accumulated for every category present in either the label
    or the prediction of that pair.

    Args:
        dataset (list[tuple[Document, Document]]): a dataset containing a list of tuples with the label and corresponding prediction.
        word_type (WordType): Type of words to use for bag-of-words computation.
        by_category (bool, optional): Whether to compute the metric globally or for each category. Defaults to False.
        print_table (bool, optional): Whether to print the table. Defaults to True.

    Returns:
        PrettyTable: The evaluation table.
    """
    # Initialize scores
    total_score_f1 = MicroAverageFScore()
    total_score_err = MicroAverageErrorRate()

    # Iterate over the dataset
    for label, prediction in dataset:
        label_list, prediction_list = tokenize_entities(
            label,
            prediction,
            word_type=word_type,
        )

        # Overall score for this document pair
        score = BagOfWords(
            labels=label_list[GLOBAL_STAT_NAME],
            predictions=prediction_list[GLOBAL_STAT_NAME],
        )
        total_score_f1.update(GLOBAL_STAT_NAME, score)
        total_score_err.update(GLOBAL_STAT_NAME, score)

        if not by_category:
            continue

        # Score every category seen in the label OR the prediction. Taking only the
        # label's categories would drop predictions made in a category the label does
        # not contain: they are counted in the overall score above but would be missing
        # from every per-category score. The keys come from the prepared dicts so they
        # match the attribute actually being scored for this word type.
        categories = (label_list.keys() | prediction_list.keys()) - {GLOBAL_STAT_NAME}
        for category in categories:
            # ``.get`` avoids inserting empty entries into the defaultdicts for a
            # category present on one side only.
            category_score = BagOfWords(
                labels=label_list.get(category, []),
                predictions=prediction_list.get(category, []),
            )
            total_score_err.update(category, category_score)
            total_score_f1.update(category, category_score)

    # Format and display results
    table = make_bag_of_entities_prettytable(
        errors=total_score_err,
        detections=total_score_f1,
    )
    if print_table:
        print(table)  # noqa: T201
    return table

compute_bag_of_words

compute_bag_of_words(
    label_dir: Path,
    prediction_dir: Path,
    by_category: bool = False,
) -> PrettyTable

Compute bag-of-words.

Parameters:

Name Type Description Default
label_dir Path

Path to the label directory.

required
prediction_dir Path

Path to the prediction directory.

required
by_category bool

Whether to compute the metric globally or for each category. Defaults to False.

False

Returns:

Name Type Description
PrettyTable PrettyTable

The evaluation table formatted in Markdown.

Source code in ie_eval/metrics/bag_of_entities.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def compute_bag_of_words(
    label_dir: Path,
    prediction_dir: Path,
    by_category: bool = False,
) -> PrettyTable:
    """Load a label/prediction dataset and compute bag-of-words on it.

    Thin wrapper around ``compute_bag_of_anything`` with ``WordType.word``.

    Args:
        label_dir (Path): Path to the label directory.
        prediction_dir (Path): Path to the prediction directory.
        by_category (bool, optional): Whether to compute the metric globally or for each category. Defaults to False.

    Returns:
        PrettyTable: The evaluation table.
    """
    dataset = load_dataset(label_dir=label_dir, prediction_dir=prediction_dir)
    return compute_bag_of_anything(
        dataset,
        word_type=WordType.word,
        by_category=by_category,
    )

compute_bag_of_tagged_words

compute_bag_of_tagged_words(
    label_dir: Path,
    prediction_dir: Path,
    by_category: bool = False,
) -> PrettyTable

Compute bag-of-tagged-words.

Parameters:

Name Type Description Default
label_dir Path

Path to the label directory.

required
prediction_dir Path

Path to the prediction directory.

required
by_category bool

Whether to compute the metric globally or for each category. Defaults to False.

False

Returns:

Name Type Description
PrettyTable PrettyTable

The evaluation table formatted in Markdown.

Source code in ie_eval/metrics/bag_of_entities.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
def compute_bag_of_tagged_words(
    label_dir: Path,
    prediction_dir: Path,
    by_category: bool = False,
) -> PrettyTable:
    """Load a label/prediction dataset and compute bag-of-tagged-words on it.

    Thin wrapper around ``compute_bag_of_anything`` with ``WordType.tagged_word``.

    Args:
        label_dir (Path): Path to the label directory.
        prediction_dir (Path): Path to the prediction directory.
        by_category (bool, optional): Whether to compute the metric globally or for each category. Defaults to False.

    Returns:
        PrettyTable: The evaluation table.
    """
    dataset = load_dataset(label_dir=label_dir, prediction_dir=prediction_dir)
    return compute_bag_of_anything(
        dataset,
        word_type=WordType.tagged_word,
        by_category=by_category,
    )

compute_bag_of_entities

compute_bag_of_entities(
    label_dir: Path,
    prediction_dir: Path,
    by_category: bool = False,
) -> PrettyTable

Compute bag-of-entities.

Parameters:

Name Type Description Default
label_dir Path

Path to the label directory.

required
prediction_dir Path

Path to the prediction directory.

required
by_category bool

Whether to compute the metric globally or for each category. Defaults to False.

False

Returns:

Name Type Description
PrettyTable PrettyTable

The evaluation table formatted in Markdown.

Source code in ie_eval/metrics/bag_of_entities.py
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
def compute_bag_of_entities(
    label_dir: Path,
    prediction_dir: Path,
    by_category: bool = False,
) -> PrettyTable:
    """Load a label/prediction dataset and compute bag-of-entities on it.

    Thin wrapper around ``compute_bag_of_anything`` with ``WordType.entity``.

    Args:
        label_dir (Path): Path to the label directory.
        prediction_dir (Path): Path to the prediction directory.
        by_category (bool, optional): Whether to compute the metric globally or for each category. Defaults to False.

    Returns:
        PrettyTable: The evaluation table.
    """
    dataset = load_dataset(label_dir=label_dir, prediction_dir=prediction_dir)
    return compute_bag_of_anything(
        dataset,
        word_type=WordType.entity,
        by_category=by_category,
    )