Module gender_render.gender_nouns

Functions to gender nouns with gender bias correctly.

Please note that you need to have nltk installed for this to work completely, e.g. to include all types of warnings.

Expand source code Browse git
"""
Functions to gender nouns with gender bias correctly.

Please note that you need to have `nltk` installed for this to work completely, e.g. to include all types of warnings.
"""

import requests
import json
import copy
import warnings as builtin_warnings
import os
from typing import Set, Optional, Dict, List, Callable, Tuple, Union
try:
    from typing_extensions import TypedDict, Literal
except ImportError:
    TypedDict = dict
    Literal = None

from . import warnings

# make some type definitions for the data we read from the web:


# Gender codes as they appear in the downloaded raw data. `GenderNounDataHandler.load_from_web` translates
# "f"/"m"/"n" to "female"/"male"/"neutral" and re-genders "o" ("other") as "neutral".
# If `typing_extensions` is not installed, `Literal` is None and the alias degrades to plain `str`.
if Literal:
    OriginalDataGender = Literal["f", "m", "n", "o"]
else:
    OriginalDataGender = str


class OriginalDataGenderedVersionInfo(TypedDict):
    """One differently gendered alternative of a word, as listed in a `gender_map` entry of the raw data."""
    # NOTE(review): not read anywhere in the visible part of this module
    parts_of_speech: Optional[str]
    # the alternative word itself; `load_from_web` replaces its spaces with underscores
    word: str


class OriginalDataGenderMapping(TypedDict):
    """Maps a gender code of the raw data to the list of alternatives a word offers for that gender."""
    f: Optional[List[OriginalDataGenderedVersionInfo]]
    m: Optional[List[OriginalDataGenderedVersionInfo]]
    n: Optional[List[OriginalDataGenderedVersionInfo]]
    # NOTE(review): `load_from_web` only reads "f", "m" and "n"; what "h" stands for is not visible here -- confirm
    h: Optional[List[OriginalDataGenderedVersionInfo]]


class OriginalDataWord(TypedDict):
    """One entry of the raw word list, in the format that `GenderNounDataHandler.load_from_web` downloads."""
    word: str
    # entries that do not have this key at all are skipped by `load_from_web`
    wordnet_senseno: Optional[str]
    gender: OriginalDataGender
    # alternatives for other genders; `load_from_web` only uses the first alternative listed per gender
    gender_map: Optional[OriginalDataGenderMapping]


# the raw data as a whole is a flat list of word entries:
OriginalGenderNounData = List[OriginalDataWord]

# make some type definitions for the data we create:


# Gender names used in the data this module creates (as opposed to the one-letter codes of the raw data).
# If `typing_extensions` is not installed, `Literal` is None and the alias degrades to plain `str`.
if Literal:
    GeneratedDataGender = Literal["female", "male", "neutral"]
else:
    GeneratedDataGender = str


class GeneratedDataGenderMapping(TypedDict):
    """Maps a gender name to the word that is the differently gendered version of a word for that gender."""
    # keys may be missing altogether: `load_from_web` only sets those genders the raw data provides a version for
    neutral: Optional[str]
    male: Optional[str]
    female: Optional[str]


class GeneratedDataWord(TypedDict):
    """Everything that is stored about a single word in the generated data."""
    # "female", "male" or "neutral" -- these are the values `load_from_web` assigns
    gender: str
    gender_map: GeneratedDataGenderMapping
    # human-readable notes about automated decisions; the key only exists once a pipeline step attached a note.
    # It is a set in memory and a sorted list on disk (converted by `save_to_disk` / `load_from_disk`).
    warning: Optional[Set[str]]


# the generated data as a whole maps every word to the data stored about it:
GeneratedGenderNounData = Dict[str, GeneratedDataWord]

# functions to check for the validity of (gendered) words/nouns:

try:
    import nltk
    try:
        nltk.data.find("corpora/words")
        nltk.data.find("corpora/wordnet")
    except LookupError:
        builtin_warnings.warn("nltk corpus (words and/or wordnet) not found; downloading it since it is small.")
        nltk.download('words', quiet=True)
        nltk.download('wordnet', quiet=True)
    from nltk.corpus import wordnet as wn
    person_synsets = set(wn.synsets("person")) | set(wn.synsets("people"))
    english_person_nouns = (
        set([w.lower() for p in person_synsets for s in p.closure(lambda s: s.hyponyms()) for w in s.lemma_names()]))
    english_nouns = set(w.name().split(".")[0].lower() for w in wn.all_synsets("n")) | english_person_nouns
    english_vocab = set(w.lower() for w in nltk.corpus.words.words()) | english_nouns

    def is_a_word(word: str) -> bool:
        """Checks whether the given word is a valid english word."""
        return set(word.lower().split("_")).issubset(english_vocab) or word.lower() in english_vocab

    def is_a_noun(word: str) -> bool:
        """Checks whether the given word is a valid english noun."""
        return set(word.lower().split("_")).issubset(english_nouns) or word.lower() in english_nouns

    def is_a_person_noun(word: str) -> bool:
        """Checks whether the given word is a valid english person noun."""
        return set(word.lower().split("_")).issubset(english_person_nouns) or word.lower() in english_person_nouns
        # ^ this function is not used anymore, but remains for purposes of testing, completeness and developement

except ImportError:
    builtin_warnings.warn("The nltk-module is not installed. Some types of helpful hints and warnings may not be "
                          + "raised, but otherwise, this is not an issue.")

    def is_a_word(word: str) -> bool:
        """Checks whether the given word is a valid english word."""
        return True

    def is_a_noun(word: str) -> bool:
        """Checks whether the given word is a valid english noun."""
        return True

    def is_a_person_noun(word: str) -> bool:
        """Checks whether the given word is a valid english person noun."""
        return True

# a helper function for logging:


def lwarn(*text, sep=" ", end="\n"):
    """Log helper with a print-like signature ("lwarn" is short for "log warning").

    The arguments are converted to strings, joined with `sep` and terminated with `end`; the resulting message is
    handed to the warning manager as a "BuildingGenderedNounDataLogging"-warning, so it only shows up if that
    warning is enabled."""
    message = sep.join(map(str, text)) + end
    warnings.WarningManager.raise_warning(message, warnings.BuildingGenderedNounDataLogging)

# a pipeline for creating files that describe differently gendered versions of gendered nouns:


class GenderNounDataHandler:
    """Bundles several static methods to handle and create data that describes the differently gendered versions of
    gendered nouns.
    These methods form a pipeline used for creating a full linked graph of gendered nouns.
    All methods of the pipeline print extensive logs if the corresponding "warning" is enabled, and may or may not
    modify their input in-place.
    Please note that this pipeline is tested only in environments with nltk installed."""

    @staticmethod
    def load_from_web() -> GeneratedGenderNounData:
        """Creates a JSON object describing the differently gendered versions of every gendered noun.
        Words with no wordnet_senseno-attribute (that are not from wordnet) are ignored, and words with "other" as
        gender are re-gendered as "neutral".
        For every gender, only the first alternative listed in the raw data is used, with spaces replaced by
        underscores; genders whose list of alternatives is missing, empty or null are left out of the gender map.

        The data used for this is taken from https://github.com/ecmonsen/gendered_words
        (which is not by me; see the repository for the license); it is downloaded from the
        `phseiff/gendered_words` repository on GitHub.

        Raises whatever `requests` raises if the download fails, including `requests.exceptions.Timeout` if the
        server does not answer in time and `requests.exceptions.HTTPError` if it answers with an error status."""

        grammatical_genders = {"m": "male", "f": "female", "n": "neutral"}

        # load from the web, in a completely wrong format that we have yet to change.
        # A timeout keeps an unresponsive server from blocking forever, and checking the status makes a failed
        # download fail as such rather than as a confusing JSON decoding error further down:
        response = requests.get(
            "https://raw.githubusercontent.com/phseiff/gendered_words/master/gendered_words.json",
            timeout=30)
        response.raise_for_status()
        raw_json: OriginalGenderNounData = json.loads(response.text)

        # change the format:
        result = dict()
        for word in raw_json:
            word_name = word["word"]
            if "wordnet_senseno" not in word:
                lwarn("\"" + word_name + "\" ignored because it is not part of wordnet and therefore not a hyponyms "
                      + "for a person.")
                continue

            # the gender map (and every list within it) is optional in the raw data, so treat null like absent:
            raw_gender_map = word.get("gender_map") or dict()
            gender_map = dict()
            for short, long in grammatical_genders.items():
                versions = raw_gender_map.get(short)
                if versions:
                    gender_map[long] = versions[0]["word"].replace(" ", "_")

            if word["gender"] == "o":
                lwarn("Found an \"other\"-word! It's \"" + word_name + "\".")
                gender = "neutral"
            else:
                gender = grammatical_genders[word["gender"]]
            result[word_name] = {"gender_map": gender_map, "gender": gender}

        lwarn(len(result), "words found.")
        return result

    @staticmethod
    def load_from_disk(file_name: str) -> Tuple[GeneratedGenderNounData, Dict[str, str]]:
        """Loads the gendered nouns from a piece of json data contained in the given file.
        The json object in this file must have a data-attribute, which contains the actual gendered noun data, and may
        contain an arbitrary amount of meta data in the form of strings mapped to strings, which will be returned as a
        dict as the second return value.
        The first return value is the actual gendered noun data, with the only change being that the
        `warning`-attribute of every noun that has one is converted from a list to a set.
        This assumes the gendered noun data to be of the same type as the data used throughout all the other methods
        of this class, NOT the format of the repository from which `load_from_web` loads its data.

        The file is read as UTF-8, the standard encoding of JSON, independently of the platform's default encoding.
        Raises `KeyError` if the json object has no data-attribute."""
        with open(file_name, "r", encoding="utf-8") as f:
            meta_data = json.load(f)
        # everything except the data-attribute is meta data:
        actual_data = meta_data.pop("data")

        # json has no sets, so warnings are stored as lists and need to be converted back:
        for word_data in actual_data.values():
            if "warning" in word_data:
                word_data["warning"] = set(word_data["warning"])
        return actual_data, meta_data

    @staticmethod
    def save_to_disk(graph: GeneratedGenderNounData, file_name: str, **meta_data: str) -> None:
        """Saves the given gendered nouns data to the given file as json. The data is saved without any modifications,
        except for the `warning`-attribute of all noun data, which is converted from a set to a sorted list to be
        compatible with standard json format, in the data-attribute of the resulting json file.
        `meta_data` may be used to describe additional meta data in the form of strings of the resulting file; every
        keyword argument becomes a top-level attribute next to the data-attribute. A keyword argument named `data`
        is overwritten by the actual data, so do not use that name for meta data.
        This assumes the given data to be of the same type as the data used throughout all the other methods of this
        class, NOT the format of the repository from which `load_from_web` loads its data.
        The data saved with this method can be read again with `load_from_disk`, so that after
        `save_to_disk(data, file_name, **meta)`, `load_from_disk(file_name)` returns a tuple of a graph equal to
        `data` and the meta data.
        The file is written as UTF-8, independently of the platform's default encoding, with sorted keys and sorted
        warnings, so saving the same data twice results in the same file.
        This method is guaranteed to not change the given data in-place."""
        # work on a copy, since the sets are replaced by lists below:
        graph_copy: dict = copy.deepcopy(graph)
        for word_data in graph_copy.values():
            if "warning" in word_data:
                word_data["warning"] = sorted(word_data["warning"])
        resulting_data: dict = copy.deepcopy(meta_data)
        resulting_data["data"] = graph_copy
        with open(file_name, "w", encoding="utf-8") as f:
            json.dump(resulting_data, f, indent=4, sort_keys=True)

    @staticmethod
    def remove_words_that_are_not_nouns(graph: GeneratedGenderNounData) -> GeneratedGenderNounData:
        """Drops every entry from the graph for which neither the word itself nor any of its male, female or neutral
        versions is a noun, and returns the graph.
        This means that a word that is not a noun itself survives if one of its gendered versions is a noun.

        Usually, this step has nothing to do, because non-nouns are already filtered out when the data is read via
        load_from_web. That filtering relies on an undocumented feature of the online data set though, so this
        function makes sure that a change of that feature does not break this code.

        This step may or may not change the given object in-place."""

        checked_genders = ("male", "female", "neutral")

        deleted = 0
        # iterate over a snapshot, since entries are deleted from the graph along the way:
        for word_name, word_data in tuple(graph.items()):
            versions = word_data["gender_map"]
            if is_a_noun(word_name) or any(
                    is_a_noun(versions[gender]) for gender in checked_genders if gender in versions):
                continue
            lwarn("".join(("Deleting \"", word_name, "\", since it is not a noun!")))
            deleted += 1
            del graph[word_name]

        lwarn(deleted, "words deleted.")
        return graph

    @staticmethod
    def make_sure_all_referenced_words_exist(graph: GeneratedGenderNounData) -> GeneratedGenderNounData:
        """Adds an entry for every word that is linked to as a gendered version of some word, but has no entry of its
        own yet, and returns the graph.
        A word created this way gets the gender it was linked as, and links back to the word that referenced it
        (under that word's gender)."""

        created = 0
        # iterate over a snapshot, since entries are added to the graph along the way:
        for word_name, word_data in tuple(graph.items()):
            for gender_name, link_name in word_data["gender_map"].items():
                if link_name in graph:
                    # nothing to do; whether the linked word actually has the gender it is linked as is not checked
                    # here, since create_extra_links_to_gender_ambiguous_words() deals with such words.
                    continue
                lwarn("".join((
                    "\"", word_name, "\" lists \"", link_name, "\" as its ", gender_name, " version, but \"",
                    link_name, "\" does not exist in the word data file.")))
                created += 1
                graph[link_name] = {"gender": gender_name, "gender_map": {word_data["gender"]: word_name}}

        lwarn(created, "new words created.")
        return graph

    @staticmethod
    def choose_better_word(option1, option2, log=False):
        """There are cases where a word has two different gendered versions for the same gender;
        for example, "foremother" and "ancestress" are both female words for an ancestor. In these cases, the "better"
        one will be chosen according to this pipeline.
        This is relevant for `make_all_links_two_sided`, for example (see docstring).

        If `log` is set, having to resort to alphabetical decisions is logged."""

        # ToDo: Add further rules to this, or better yet, make manual (no automated data!) PRs to
        #  https://github.com/phseiff/gendered-words to clear up all these unnecessary unclearnesses.
        bad_things_in_order_of_badness: List[Callable[[str], int]] = [
            # the sooner the lambda, the worse it is:
            lambda word: len([c for c in word if c == "_"]),
            lambda word: int(word.endswith("man") or word.endswith("woman")),
            lambda word: int(word.endswith("person")),
            lambda word: int(word.endswith("mother") or word.endswith("father")),
            lambda word: int(word.endswith("parent"))
        ]
        for badness_meter in bad_things_in_order_of_badness:
            if badness_meter(option1) > badness_meter(option2):
                return option2
            elif badness_meter(option2) > badness_meter(option1):
                return option1
        # otherwise, return the alphabetically first one:
        if log:
            lwarn("Had to alphabetically decide between", option1, "and", option2)
        return sorted([option1, option2])[0]

    @staticmethod
    def make_all_links_two_sided(graph: GeneratedGenderNounData, log_clashs=False)\
            -> GeneratedGenderNounData:
        """Returns a version of the graph where every word linked to links back to the word linking to it, if this
        word's gender is not yet in its gender mapping.
        This also goes for triangles where a links to b and c, but b and c are not linked.

        The graph is modified in-place and also returned. Every word that is linked to must already have an entry in
        the graph (see `make_sure_all_referenced_words_exist`), since linked words are looked up without any check.

        If `log_clashs` is set to True, clashes are logged (these are cases where there are multiple versions available
        for the gendered version of a word). This should only be enabled before
        `create_gendered_versions_for_words_that_end_with_gender_indicators` is called on the data for the first time,
        since afterwards, multiple words might be connected that weren't connected before. One example would be that
        "wonder_woman" and "wonder_girl" are both considered female, and both will get "wonder_person" assigned as their
        neutral version by `create_gendered_versions_for_words_that_end_with_gender_indicators`. This is not an issue
        since one of them already links to "wonder_boy", and the other one to "wonder_man" from the beginning, but
        they will still be considered clashing by the algorithm, so calling `make_all_links_two_sided` with
        `log_clashs` set to True after applying `create_gendered_versions_for_words_that_end_with_gender_indicators` to
        the data will create logs for two words that don't clash more than linguistically necessary.

        Whenever a newly created link had to be chosen from several candidates, a warning saying so is attached to
        the word that received the link. Afterwards, the warnings of all words within a group of connected words are
        merged and attached to every word of that group; this always happens, there is no parameter to disable it."""

        # number of links added to the graph:
        count = 0

        # makes sure we do not report a clash twice.
        already_reported_clashes: Set[str] = set()

        def assign_value_or_use_old_one(gender_dict, gender, value):
            """Assigns a gendered version (`value`) to the given `gender` in the `gender_dict`.
            If there is already a different value for the given gender, `choose_better_word` decides whether the old
            value stays or is replaced by `value`, and the clash is logged once if `log_clashs` is set.
            In either case, `value` and the chosen word are recorded in `gender_dict_alts` (a variable of the
            enclosing loop, not a parameter), which is later used to build a warning indicating that there was an
            algorithmic choice between several options."""
            if gender in gender_dict and gender_dict[gender] != value:
                # log the clash if it wasn't logged yet:
                clash_report = value + " clashes with " + gender_dict[gender]
                if log_clashs and clash_report not in already_reported_clashes:
                    lwarn(clash_report)
                    already_reported_clashes.add(clash_report)

                # choose the better of both options (keep the old word or rather use the new one):
                chosen_option = GenderNounDataHandler.choose_better_word(value, gender_dict[gender], log=log_clashs)
            else:
                chosen_option = value

            gender_dict[gender] = chosen_option
            # `gender_dict_alts` is looked up in the enclosing scope at call time; it is (re)created for every link
            # group in the second iteration below, before this function is called for that group:
            gender_dict_alts[gender] |= {chosen_option, value}

        def get_all_connected_words(word_name, data: Union[Set[str], None] = None):
            """Returns a set of all words that are indirectly or directly linked to the given word.
            Links are followed recursively in the direction they point to; `data` is the set of words collected so
            far and is extended in-place. Groups already stored in `link_groups` for a linked word are merged in."""
            if not data:
                data = {word_name}
            for link_name in graph[word_name]["gender_map"].values():
                if link_name not in data:
                    data.add(link_name)
                    get_all_connected_words(link_name, data)
                if link_name in link_groups:
                    data |= link_groups[link_name]
            return data

        # maps a word to the set of all words it is connected to (including itself):
        link_groups: Dict[str, Set[str]] = dict()
        words_we_already_visited: Set[str] = set()
        # first iteration is to make FULL links groups of connected words before we start linking words to each other:
        for word_name, word_data in graph.items():
            if word_name not in words_we_already_visited:

                # create group of all words that are linked to this word (including those that are once removed):
                link_group = get_all_connected_words(word_name)
                for link_name in link_group:
                    if link_name in link_groups:
                        # a group stored earlier is extended in-place, so every word sharing that set object sees it:
                        link_groups[link_name] |= link_group
                    else:
                        # stores the very same set object (not a copy) for every new member of the group:
                        link_groups[link_name] = link_group

                # add these words to the visited words now:
                words_we_already_visited |= link_group

        # second iteration is to actually do the linking:
        words_we_already_visited: Set[str] = set()
        for word_name in graph.keys():
            # if we did not visit this word yet:
            if word_name not in words_we_already_visited:
                link_group = link_groups[word_name]
                words_we_already_visited |= link_group

                # create a gender-dict for all of them:
                # genders for which a word of the group has that gender itself (rather than only being linked as it):
                genders_we_are_very_sure_about: Set[GeneratedDataGender] = set()
                # the word chosen for each gender:
                gender_dict: Dict[GeneratedDataGender, str] = dict()
                # all candidates that were considered for each gender:
                gender_dict_alts: Dict[GeneratedDataGender, Set[str]] = {"female": set(), "male": set(),
                                                                         "neutral": set()}
                for link_name in link_group:
                    # add gender of the individual...
                    gender = graph[link_name]["gender"]
                    if gender in genders_we_are_very_sure_about:
                        assign_value_or_use_old_one(gender_dict, gender, link_name)
                    else:
                        # first word that has this gender itself: discard whatever was collected from gender maps for
                        # this gender so far, since a word's own gender takes precedence over links.
                        gender_dict_alts[gender] = {link_name}
                        gender_dict[gender] = link_name
                    genders_we_are_very_sure_about.add(gender)

                    # ...as well as genders it specifies in its gender_map, but only if we couldn't find these otherwise
                    for gender, link_name2 in graph[link_name]["gender_map"].items():
                        if gender not in genders_we_are_very_sure_about:
                            assign_value_or_use_old_one(gender_dict, gender, link_name2)

                # link words with each other, so each word gets the full gender dict to call its own,
                # but only for genders it does not have in its own original gender dict yet, so manual links in the
                # already given data take precedence:
                for link_name in link_group:
                    for gender, link_name2 in gender_dict.items():
                        if gender not in graph[link_name]["gender_map"] and gender != graph[link_name]["gender"]:
                            graph[link_name]["gender_map"][gender] = link_name2
                            # NOTE(review): the comma after the gender starts a second argument, which lwarn joins with
                            # a space, so the logged line contains two spaces before "but" -- confirm whether a "+"
                            # was intended.
                            lwarn("\"" + link_name + "\" is (indirectly) linked to \"" + link_name2
                                  + "\", which is " + graph[link_name2]["gender"], " but \"" + link_name
                                  + "\" has no " + graph[link_name2]["gender"] + " version.")
                            count += 1

                            # if the newly added link was chosen between two possible words algorithmically, add this as
                            # a warning:
                            if gender_dict_alts[gender] != {link_name2}:
                                if "warning" not in graph[link_name]:
                                    graph[link_name]["warning"] = set()
                                # sorted, so the text of the warning does not depend on the iteration order of the set:
                                alt_values = sorted(list(gender_dict_alts[gender]))
                                alt_values_str = ", ".join(alt_values[:-1]) + " and " + alt_values[-1]
                                new_warning = (alt_values_str + " would've all been good values for the " + gender + " "
                                               + "version of \"" + link_name + "\", but \"" + link_name2
                                               + "\" was automatically chosen based on an algorithm.")
                                lwarn(new_warning)
                                graph[link_name]["warning"].add(new_warning)

                # infect linked words with the warnings of the words they're linked to:
                link_group_warnings = set()
                for link_name in link_group:
                    if "warning" in graph[link_name]:
                        link_group_warnings |= graph[link_name]["warning"]
                # words only get a warning-attribute if there is at least one warning within their group:
                if link_group_warnings:
                    for link_name in link_group:
                        if "warning" not in graph[link_name]:
                            graph[link_name]["warning"] = set()
                        graph[link_name]["warning"] |= link_group_warnings

        lwarn(count, "links created.")
        return graph

    @staticmethod
    def create_extra_links_to_gender_ambiguous_words(graph: GeneratedGenderNounData) -> GeneratedGenderNounData:
        """Makes use of links whose target has a different gender than the one it is linked as.

        A word A with gender x may list a word B as its gender-y-version, while B itself is listed with gender z for
        some reason. If A has no gender-z-version (and z is not A's own gender), this method additionally links A to
        B as its gender-z-version. The existing gender-y-link is left untouched.
        Every linked word must have an entry in the graph. Returns the graph."""

        created = 0
        for word_name, word_data in graph.items():
            own_links = word_data["gender_map"]
            # iterate over a snapshot, since links are added to the mapping along the way:
            for gender_name, link_name in tuple(own_links.items()):
                linked_gender = graph[link_name]["gender"]
                if linked_gender in own_links or linked_gender == word_data["gender"]:
                    continue
                own_links[linked_gender] = link_name
                lwarn("".join((
                    "\"", word_name, "\" does not have a ", linked_gender, " version, but a word it links to as its ",
                    gender_name, " version is ", linked_gender, ".")))
                created += 1

        lwarn(created, "links created.")
        return graph

    @staticmethod
    def create_gendered_versions_for_words_that_end_with_gender_indicators(graph: GeneratedGenderNounData)\
            -> GeneratedGenderNounData:
        """Some words end on "-man" or "-woman", or similar things like "boy" or "maid", but don't have a male/female/
        neutral version. This method returns a version of the graph where every word of these has a male, female as well
        as neutral version. It also ensures that the resulting word doesn't end with "_" due to having its end or
        beginning removed, and that it doesn't create "empty" words.

        Indicators are matched at the end as well as at the beginning of a word, in the order of the table below, and
        only the first table row that is applicable to a word is used for it. Versions are only created for genders
        that the word neither has itself nor already links to. A version that is not in the graph yet is added to it
        with an empty gender map; both the added word and the word that links to it get a warning about the automated
        decision attached. The graph is modified in-place and also returned."""

        # ToDo: better suggestions regarding the maid/maiden/boy/girl-stuff are welcome!
        # ToDo: running replacement strategies like this when creating a tag with a GenderedNoun-object for an unknown
        #  noun might be a good idea... this would, however, require figuring out a words gender, so the table we have
        #  here would have to be accessible outside this function.
        #  Feel free to submit a pull request for this, or an issue if you see this fitting!
        # Every row states where the indicator is searched for ("start" or "end" of the word), and which indicators
        # of the three genders correspond to each other. An empty indicator never matches anything (see below), but
        # can still be used as a replacement:
        gender_indicator_tuples_table = [
            ("start", [("female", "female"),   ("male", "male"),       ("neutral", "")]),

            ("end",   [("female", "woman"),    ("male", "man"),        ("neutral", "person")]),
            ("end",   [("female", "girl"),     ("male", "boy"),        ("neutral", "bean")]),
            ("end",   [("female", "maiden"),   ("male", "gentleman"),  ("neutral", "gentleperson")]),
            ("end",   [("female", "maid"),     ("male", "manservant"), ("neutral", "servant")]),  # or butler? domestic?
            ("end",   [("female", "aunt"),     ("male", "uncle"),      ("neutral", "auncle")]),
            ("end",   [("female", "daughter"), ("male", "son"),        ("neutral", "child")]),
            ("end",   [("female", "mother"),   ("male", "father"),     ("neutral", "parent")]),
            ("end",   [("female", "wife"),     ("male", "husband"),    ("neutral", "spouse")]),
            ("end",   [("female", "niece"),    ("male", "nephew"),     ("neutral", "nibling")]),
            ("end",   [("female", "female"),   ("male", "male"),       ("neutral", "person")]),
            ("end",   [("female", "sister"),   ("male", "brother"),    ("neutral", "sibling")]),
            ("end",   [("female", "queen"),    ("male", "king"),       ("neutral", "monarch")]),

            ("start", [("female", "woman"),    ("male", "man"),        ("neutral", "person")]),
            ("start", [("female", "girl"),     ("male", "boy"),        ("neutral", "bean")]),
            ("start", [("female", "maiden"),   ("male", "gentleman"),  ("neutral", "gentleperson")]),
            ("start", [("female", "maid"),     ("male", "manservant"), ("neutral", "servant")]),
            ("start", [("female", "aunt"),     ("male", "uncle"),      ("neutral", "auncle")]),
            ("start", [("female", "daughter"), ("male", "son"),        ("neutral", "child")]),
            ("start", [("female", "mother"),   ("male", "father"),     ("neutral", "parent")]),
            ("start", [("female", "wife"),     ("male", "husband"),    ("neutral", "spouse")]),
            ("start", [("female", "niece"),    ("male", "nephew"),     ("neutral", "nibling")]),
            ("start", [("female", "female"),   ("male", "male"),       ("neutral", "person")]),
            ("start", [("female", "sister"),   ("male", "brother"),    ("neutral", "sibling")]),
            ("start", [("female", "queen"),    ("male", "king"),       ("neutral", "monarch")])
        ]

        words_created = 0
        links_created = 0
        # iterate over a snapshot, since words are added to the graph along the way; words that are created here are
        # therefore not processed themselves:
        for word_name, word_data in list(graph.items()):
            for end_or_start, gender_indicator_tuples in gender_indicator_tuples_table:
                # set once a row was applicable to the word, which ends the search through the table for this word:
                created_corresponding_gendered_versions = False
                for gender, gender_indicator in gender_indicator_tuples:
                    # the trailing `and gender_indicator` rules out empty indicators, which would match every word:
                    if (((end_or_start == "end" and word_name.endswith(gender_indicator))
                            or (end_or_start == "start" and word_name.startswith(gender_indicator)))
                            and gender_indicator):
                        not_applicable = False
                        # a word matching a neutral indicator gets no versions created, but still counts as handled
                        # by this row, which ends the search through the table for it:
                        if gender != "neutral":
                            other_gender_indicator_tuples = [
                                t for t in gender_indicator_tuples if t != (gender, gender_indicator)]
                            for other_gender, other_gender_indicator in other_gender_indicator_tuples:
                                # only fill in genders that the word neither has itself nor links to yet:
                                if other_gender not in set(word_data["gender_map"].keys()) | {word_data["gender"]}:
                                    # figure out the potential new word:
                                    if end_or_start == "end":
                                        new_gendered_version = (word_name[:-len(gender_indicator)]
                                                                + other_gender_indicator)
                                    else:
                                        new_gendered_version = (other_gender_indicator
                                                                + word_name[len(gender_indicator):])
                                    # removing the indicator may leave a dangling separator behind:
                                    new_gendered_version = new_gendered_version.lstrip("_").rstrip("_")
                                    if new_gendered_version == "":
                                        # nothing is left of the word; remember this, so that the row does not count
                                        # as applicable and the following indicators and rows are tried as well:
                                        not_applicable = True
                                        continue
                                    # create the new word or link to it:
                                    # NOTE(review): the log says "ends with" even if the indicator was matched at the
                                    # start of the word -- confirm whether this is intended.
                                    lwarn("\"" + word_name + "\" ends with \"-" + gender_indicator
                                          + "\", but it has no " + other_gender + " version. ", end="")
                                    word_data["gender_map"][other_gender] = new_gendered_version
                                    links_created += 1
                                    if new_gendered_version not in graph:
                                        lwarn("Creating one as \"" + new_gendered_version + "\"!")
                                        words_created += 1
                                        # the new word does not link back yet, its gender map starts out empty:
                                        graph[new_gendered_version] = {
                                            "gender": other_gender,
                                            "gender_map": dict(),
                                            "warning": {"\"" + new_gendered_version
                                                        + "\" was automatically generated as the " + other_gender
                                                        + " version of a word due to the "
                                                        + ("ending" if end_or_start == "end" else "beginning")
                                                        + " of said " + "word."}
                                        }
                                    else:
                                        lwarn("Linking to \"" + new_gendered_version + "\".")
                                    # Add a new warning that a link was created between those two:
                                    # NOTE(review): the text names `gender` (the gender of the matched indicator), not
                                    # `other_gender` (the gender the new version was linked as) -- confirm the wording.
                                    if "warning" not in word_data:
                                        word_data["warning"] = set()
                                    word_data["warning"].add("\"" + new_gendered_version
                                                             + "\" was automatically linked to \"" + word_name
                                                             + "\" as its " + gender + " version due to the "
                                                             + ("ending" if end_or_start == "end" else "beginning")
                                                             + " of said " + "word.")

                        if not not_applicable:
                            created_corresponding_gendered_versions = True
                            break
                if created_corresponding_gendered_versions:
                    break

        lwarn(words_created, "new words created.")
        lwarn(links_created, "new links created.")
        return graph

    @staticmethod
    def find_words_with_no_neutral_form(graph: GeneratedGenderNounData) -> GeneratedGenderNounData:
        """Gives every non-neutral word that links to no neutral version a neutral version, and logs each such word.

        The neutral version is picked as follows:
        * a male word is used as its own neutral version,
        * any other word uses its male version, if its gender map has one,
        * and is used as its own neutral version otherwise.

        A warning describing the choice is attached to the word. The graph is modified in-place and returned."""

        total = 0
        own_male = 0
        borrowed_male = 0
        own_other = 0

        for word_name, word_data in list(graph.items()):
            # neutral words, and words that already link to a neutral version, need no treatment:
            if word_data["gender"] == "neutral":
                continue
            links = word_data["gender_map"]
            if "neutral" in links:
                continue

            lwarn("\"" + word_name + "\" is neither neutral, nor does it link to a neutral version.")
            total += 1

            # decide on the neutral form and on the warning that explains the decision:
            if word_data["gender"] == "male":
                neutral_form = word_name
                note = ("\"" + word_name + "\" does not have an explicit, human-picked neutral version, and since it "
                        "is male, it's automatically used as own neutral version. This is done because lots of words "
                        "in the database are wrongly marked as male even though they are actually neutral.")
                own_male += 1
            elif "male" in links:
                neutral_form = links["male"]
                note = ("\"" + word_name + "\" does not have an explicit, human-picked neutral version, and since it "
                        "has a male version, this version is automatically used as its neutral version. This is done "
                        "because lots of words in the  database are wrongly marked as male even though they are "
                        "actually neutral.")
                borrowed_male += 1
            else:
                neutral_form = word_name
                note = ("\"" + word_name + "\" neither has a male nor a female version, so it is used as its own "
                        "neutral version.")
                own_other += 1

            links["neutral"] = neutral_form
            word_data.setdefault("warning", set()).add(note)
            lwarn(word_data["warning"])

        lwarn(total, "instances found.")
        lwarn(own_male, "instances where male words where used as their own neutral version,")
        lwarn(borrowed_male, "instances where a male version of a word was used as its neutral version,")
        lwarn(own_other, "instances where a female word was used as its own neutral version.")

        return graph

    @staticmethod
    def create_full_graph_from_web() -> GeneratedGenderNounData:
        """Runs the complete pipeline of this method collection: pulls the graph of gendered words from the web and
        then applies every hole-filling step to it, in a fixed order. An empty log line is emitted before each step."""

        handler = GenderNounDataHandler
        graph = handler.load_from_web()

        # the processing steps, in the order in which they have to be applied:
        steps = (
            handler.remove_words_that_are_not_nouns,
            handler.make_sure_all_referenced_words_exist,
            handler.make_all_links_two_sided,
            handler.create_extra_links_to_gender_ambiguous_words,
            # clashes are only logged in this one linking pass:
            lambda data: handler.make_all_links_two_sided(data, log_clashs=True),
            handler.create_gendered_versions_for_words_that_end_with_gender_indicators,
            handler.make_all_links_two_sided,
            handler.find_words_with_no_neutral_form,
        )
        for step in steps:
            lwarn("")
            graph = step(graph)
        return graph


# the final dict:

# !!! The following lines of code (or, to say it pythonic, the following dict) contain license information.
# This license information is part of this code in that it is processed by the code and then written into a different
# file, alongside the runtime-generated content it refers to. IT DOES NOT, HOWEVER, REFER TO THE CONTENT OF THE FILE YOU
# FOUND IT IN, IF YOU FOUND IT ALONGSIDE THIS DISCLAIMER.
GDN_META_DATA = {
    # Attribution text for the generated noun data. The dict is unpacked as keyword meta data into `save_to_disk`
    # further down in this module, so this text is written into the generated data file.
    # The backslash at the end of every line continues the string literal, so the value contains no line breaks.
    "license": '\
Most parts of the contents of this file were created based on WordNet content, which is licensed under the WordNet\
 license (https://wordnet.princeton.edu/license-and-commercial-use), by GitHub-user ecmonson\
 (https://github.com/ecmonsen), who applied modifications to it licensed under the Creative Commons Attribution 3.0\
 (https://creativecommons.org/licenses/by/3.0/us/), as explained here\
 (https://github.com/ecmonsen/gendered_words#license).\
 The dataset was then taken by GitHub-user phseiff (https://phseiff.com), who converted it into a different format,\
 effectively stripping all WordNet content but the selection of hyponyms for "person" from it, and closed some holes in\
 it, as outlines in this (https://github.com/ecmonsen/gendered_words/issues/1) issue.\
 Contributions by phseiff are licensed under Creative Commons Attribution 3.0 as well.'
}
# Location of the noun data file, relative to the directory this module lives in. `os.path.dirname` is used instead of
# splitting `__file__` at `os.sep` by hand, because the manual split returns the file name itself (rather than its
# directory) whenever `__file__` contains no separator.
noun_data_location = os.path.join(os.path.dirname(__file__), "data/gendered-nouns.gdn")
GENDER_DICT: GeneratedGenderNounData
try:
    GENDER_DICT, _ = GenderNounDataHandler.load_from_disk(noun_data_location)
except FileNotFoundError:
    # No data file yet: warn, build the graph from the web, store it together with the license meta data, and then
    # read it back from disk.
    warnings.WarningManager.raise_warning(None, warnings.GenderedNounsBuildFromWebWarning)
    g = GenderNounDataHandler.create_full_graph_from_web()
    # the data directory itself may be missing as well, in which case writing the file would fail:
    os.makedirs(os.path.dirname(noun_data_location), exist_ok=True)
    GenderNounDataHandler.save_to_disk(g, noun_data_location, **GDN_META_DATA)
    GENDER_DICT, _ = GenderNounDataHandler.load_from_disk(noun_data_location)


# Representation of a not-yet correctly gendered noun:

class GenderedNoun:
    """A representation of a gendered noun, with methods to get gendered equivalents of it.

    Two instances are equal (and hash equally) if they represent the same word."""

    def __init__(self, word: str):
        """Generates an object to get gendered versions of the given noun for different genders.

        The word is stored and looked up in `GENDER_DICT` exactly as given; no case normalization is applied.
        A warning is raised through the `WarningManager` if the word is not in `GENDER_DICT` (the warning type
        depends on whether it is a known word / a known noun), or if `GENDER_DICT` lists warnings for it."""

        self.word = word

        # raise warnings if the word is not a word/ noun/ person noun:
        if word not in GENDER_DICT:
            if not is_a_word(word):
                warnings.WarningManager.raise_warning("\"" + word + "\" is not a known word, so gender*render might not"
                                                      + " be able to gender it correctly.", warnings.NotAWordWarning)
            elif not is_a_noun(word):
                warnings.WarningManager.raise_warning("\"" + word + "\" is not a known noun, so gender*render might not"
                                                      + " be able to gender it correctly.", warnings.NotANounWarning)
            else:
                warnings.WarningManager.raise_warning("\"" + word + "\" is not a hyponym for person, so gender*render "
                                                      + "might not be able to gender it correctly.",
                                                      warnings.NotAPersonNounWarning)
        elif "warning" in GENDER_DICT[word]:
            # the warnings are stored in a set; they are sorted so that the emitted text is deterministic:
            warnings.WarningManager.raise_warning("warnings for \"" + word + "\":\n"
                                                  + "\n".join(sorted(GENDER_DICT[word]["warning"])),
                                                  warnings.NounGenderingGuessingsWarning)
            # ToDo: Maybe only print those warnings that contain `"\"" + word + "\""` in them? This would require
            #  reviewing all warnings attached to words by this modules code, to be sure this actually prints all
            #  relevant warnings, as well as injecting some trivial code here and generally discussing this idea in an
            #  issue.
            #  See also the comment in test/test_gender_nouns in test_create_full_graph_from_web.

    def render_noun(self, gender: GeneratedDataGender) -> str:
        """Returns the correctly gendered version of itself as a string, with underscores replaced by spaces.

        `gender` must be either "male", "female" or "neutral". If the word has no version for the requested gender,
        its neutral version is used instead. Words that are not in `GENDER_DICT`, and words for which not even a
        neutral version is known, are returned unchanged (apart from the underscore replacement)."""
        # ToDo: Re-test this since capitalization is no longer supported.

        word = self.word
        if word not in GENDER_DICT:
            return word.replace("_", " ")

        word_data = GENDER_DICT[word]
        gender_map = word_data["gender_map"]

        # look for the neutral version if there is no version of the given gender:
        if gender != word_data["gender"] and gender not in gender_map:
            gender = "neutral"

        if word_data["gender"] == gender:
            # the word itself already is of the right gender:
            result = word
        else:
            # otherwise use the linked version; fall back to the word itself instead of failing with a KeyError if
            # the data contains no link for this gender:
            result = gender_map.get(gender, word)
        return result.replace("_", " ")

    def __eq__(self, other) -> bool:
        """Checks whether two GenderedNoun-representations are identical, based on what noun they represent."""
        if isinstance(other, self.__class__):
            return self.word == other.word
        else:
            return False

    def __hash__(self) -> int:
        """Hashes by the represented word, consistent with `__eq__`. Without this, defining `__eq__` alone would make
        instances unhashable."""
        return hash(self.word)

Functions

def is_a_word(word: str) ‑> bool

Checks whether the given word is a valid english word.

Expand source code Browse git
def is_a_word(word: str) -> bool:
    """Tells whether the given word is a valid english word.

    The lowercased word counts as valid if all of its "_"-separated parts are in the vocabulary, or if it is in the
    vocabulary as a whole."""
    lowered = word.lower()
    parts = set(lowered.split("_"))
    return parts.issubset(english_vocab) or lowered in english_vocab
def is_a_noun(word: str) ‑> bool

Checks whether the given word is a valid english noun.

Expand source code Browse git
def is_a_noun(word: str) -> bool:
    """Tells whether the given word is a valid english noun.

    The lowercased word counts as a noun if all of its "_"-separated parts are known nouns, or if it is a known noun
    as a whole."""
    lowered = word.lower()
    parts = set(lowered.split("_"))
    return parts.issubset(english_nouns) or lowered in english_nouns
def is_a_person_noun(word: str) ‑> bool

Checks whether the given word is a valid english person noun.

Expand source code Browse git
def is_a_person_noun(word: str) -> bool:
    """Tells whether the given word is a valid english person noun.

    The lowercased word counts as a person noun if all of its "_"-separated parts are known person nouns, or if it is
    a known person noun as a whole."""
    lowered = word.lower()
    parts = set(lowered.split("_"))
    return parts.issubset(english_person_nouns) or lowered in english_person_nouns
def lwarn(*text, sep=' ', end='\n')

Prints the given text, but only if the "BuildingGenderedNounDataLogging"-warning is enabled. The name "lwarn" is supposed to stand for "log warning".

Expand source code Browse git
def lwarn(*text, sep=" ", end="\n"):
    """Log helper with a print-like signature ("lwarn" is supposed to stand for "log warning").

    The given values are converted to strings, joined with `sep`, terminated with `end` and handed to the warning
    manager as a "BuildingGenderedNounDataLogging"-warning, so the text is only shown if that warning is enabled."""
    message = sep.join(map(str, text)) + end
    warnings.WarningManager.raise_warning(message, warnings.BuildingGenderedNounDataLogging)

Classes

class OriginalDataGenderedVersionInfo (*args, **kwargs)

dict() -> new empty dictionary dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs dict(iterable) -> new dictionary initialized as if via: d = {} for k, v in iterable: d[k] = v dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)

Expand source code Browse git
class OriginalDataGenderedVersionInfo(TypedDict):
    """One differently gendered version of a word, in the format of the data that is read from the web.

    Lists of these entries are the values of `OriginalDataGenderMapping`."""
    # not read by `load_from_web`; presumably a part-of-speech tag -- TODO confirm against the data set
    parts_of_speech: Optional[str]
    # the gendered word itself (`load_from_web` replaces the spaces in it with underscores)
    word: str

Ancestors

  • builtins.dict

Class variables

var parts_of_speech : Union[str, NoneType]
var word : str
class OriginalDataGenderMapping (*args, **kwargs)

dict() -> new empty dictionary dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs dict(iterable) -> new dictionary initialized as if via: d = {} for k, v in iterable: d[k] = v dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)

Expand source code Browse git
class OriginalDataGenderMapping(TypedDict):
    """Maps one-letter gender codes to the differently gendered versions of a word, in the format of the data that is
    read from the web.

    `load_from_web` translates "m", "f" and "n" to "male", "female" and "neutral" and only uses the first entry of
    each list."""
    f: Optional[List[OriginalDataGenderedVersionInfo]]
    m: Optional[List[OriginalDataGenderedVersionInfo]]
    n: Optional[List[OriginalDataGenderedVersionInfo]]
    # NOTE(review): "h" is not read by `load_from_web`; its meaning is not visible from this module -- TODO confirm
    h: Optional[List[OriginalDataGenderedVersionInfo]]

Ancestors

  • builtins.dict

Class variables

var f : Union[List[OriginalDataGenderedVersionInfo], NoneType]
var m : Union[List[OriginalDataGenderedVersionInfo], NoneType]
var n : Union[List[OriginalDataGenderedVersionInfo], NoneType]
var h : Union[List[OriginalDataGenderedVersionInfo], NoneType]
class OriginalDataWord (*args, **kwargs)

dict() -> new empty dictionary dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs dict(iterable) -> new dictionary initialized as if via: d = {} for k, v in iterable: d[k] = v dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)

Expand source code Browse git
class OriginalDataWord(TypedDict):
    """One word entry of the data that is read from the web."""
    # the word itself; used as the key of the generated data by `load_from_web`
    word: str
    # entries that do not have this key at all are ignored by `load_from_web`
    wordnet_senseno: Optional[str]
    # one-letter gender code; `load_from_web` re-genders "o" ("other") as "neutral"
    gender: OriginalDataGender
    # links to the differently gendered versions of the word, if there are any
    gender_map: Optional[OriginalDataGenderMapping]

Ancestors

  • builtins.dict

Class variables

var word : str
var wordnet_senseno : Union[str, NoneType]
var gender : typing_extensions.Literal['f', 'm', 'n', 'o']
var gender_map : Union[OriginalDataGenderMapping, NoneType]
class GeneratedDataGenderMapping (*args, **kwargs)

dict() -> new empty dictionary dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs dict(iterable) -> new dictionary initialized as if via: d = {} for k, v in iterable: d[k] = v dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)

Expand source code Browse git
class GeneratedDataGenderMapping(TypedDict):
    """Maps gender names to the name of the version of a word that has this gender, in the format of the data that is
    created by this module. A gender without a linked version may simply be missing from the mapping."""
    neutral: Optional[str]
    male: Optional[str]
    female: Optional[str]

Ancestors

  • builtins.dict

Class variables

var neutral : Union[str, NoneType]
var male : Union[str, NoneType]
var female : Union[str, NoneType]
class GeneratedDataWord (*args, **kwargs)

dict() -> new empty dictionary dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs dict(iterable) -> new dictionary initialized as if via: d = {} for k, v in iterable: d[k] = v dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)

Expand source code Browse git
class GeneratedDataWord(TypedDict):
    """The data stored for one word in the format created by this module (the values of `GeneratedGenderNounData`)."""
    # the gender of the word itself; `load_from_web` produces "male", "female" or "neutral"
    gender: str
    # the differently gendered versions of the word
    gender_map: GeneratedDataGenderMapping
    # human-readable notes about automatic decisions that affected this word; the key may be missing entirely.
    # Held as a set in memory; `save_to_disk` writes it as a sorted list and `load_from_disk` turns it back into a set.
    warning: Optional[Set[str]]

Ancestors

  • builtins.dict

Class variables

var gender : str
var gender_map : GeneratedDataGenderMapping
var warning : Union[Set[str], NoneType]
class GenderNounDataHandler

Bundles several static methods to handle and create data that describes the differently gendered versions of gendered nouns. These methods form a pipeline used for creating a full linked graph of gendered nouns. All methods of the pipeline print extensive logs if the corresponding "warning" is enabled, and may or may not modify their input in-place. Please note that this pipeline is tested only in environments with nltk installed.

Expand source code Browse git
class GenderNounDataHandler:
    """Bundles several static methods to handle and create data that describes the differently gendered versions of
    gendered nouns.
    These methods form a pipeline used for creating a full linked graph of gendered nouns.
    All methods of the pipeline print extensive logs if the corresponding "warning" is enabled, and may or may not
    modify their input in-place.
    Please note that this pipeline is tested only in environments with nltk installed."""

    @staticmethod
    def load_from_web() -> GeneratedGenderNounData:
        """Creates a JSON object describing the differently gendered versions of every gendered noun.
        Word with no wordnet_senseno-attribute (that are not from wordnet) are ignored, and words with "other" as gender
        are re-gendered as "neutral". Spaces in linked words are replaced by underscores, and only the first listed
        version per gender is used.

        The data used for this is taken from https://github.com/ecmonsen/gendered_words
        (which is not by me; see the repository for the license); it is downloaded from the phseiff/gendered_words
        copy of that repository on raw.githubusercontent.com.

        The request uses a timeout, so that an unresponsive server cannot block the import forever, and a
        `requests.RequestException` is raised if the download fails or the server answers with an error status
        (instead of attempting to parse an error page as JSON)."""

        grammatical_genders = {"m": "male", "f": "female", "n": "neutral"}

        # load from the web, in a completely wrong format that we have yet to change:
        response = requests.get(
            "https://raw.githubusercontent.com/phseiff/gendered_words/master/gendered_words.json",
            timeout=60)
        response.raise_for_status()
        raw_json: OriginalGenderNounData = json.loads(response.text)

        # change the format:
        result = dict()
        for word in raw_json:
            if "wordnet_senseno" not in word:
                lwarn("\"" + word["word"] + "\" ignored because it is not part of wordnet and therefore not a hyponyms "
                      + "for a person.")
                continue

            entry = {"gender_map": dict()}
            result[word["word"]] = entry

            # translate the one-letter gender codes of the links into the long gender names:
            if "gender_map" in word:
                for short, long in grammatical_genders.items():
                    if short in word["gender_map"]:
                        entry["gender_map"][long] = word["gender_map"][short][0]["word"].replace(" ", "_")

            if word["gender"] == "o":
                lwarn("Found an \"other\"-word! It's \"" + word["word"] + "\".")
                entry["gender"] = "neutral"
            else:
                entry["gender"] = grammatical_genders[word["gender"]]

        lwarn(len(result), "words found.")
        return result

    @staticmethod
    def load_from_disk(file_name: str) -> (GeneratedGenderNounData, Dict[str, str]):
        """Loads the gendered nouns from a piece of json data contained in the given file.
        The json object in this file must have a data-attribute, which contains the actual gendered noun data, and may
        contain an arbitrary amount of meta data in the form of strings mapped to strings, which will be returned as a
        dict as the second return value.
        The first return value is the actual gendered noun data, with the only change being that the
        `warnings`-attribute of every noun is converted from a list to a set.
        This assumes the gendered noun data to be of the same type as the data used throughout all the other methods
        of this class, NOT the format of the repository from which `load_from_web` loads its data."""
        with open(file_name, "r") as f:
            code = f.read()
        meta_data = json.loads(code)
        actual_data = meta_data["data"]
        del meta_data["data"]

        for word_data in actual_data.values():
            if "warning" in word_data:
                word_data["warning"] = set(word_data["warning"])
        return actual_data, meta_data

    @staticmethod
    def save_to_disk(graph: GeneratedGenderNounData, file_name: str, **meta_data: Dict[str, str]) -> None:
        """Saves the given gendered nouns data to the given file as json. The data is saved without any modifications,
        except for the `warnings`-attribute of all noun data, which is converted from a set to a list to be compatible
        with standard json format, in the data-attribute of the resulting json file.
        `meta_data` may be used to describe additional meta data in the form of strings of the resulting file.
        This assumes the given data to be of the same type as the data used throughout all the other methods of this
        class, NOT the format of the repository from which `load_from_web` loads its data.
        The data saved with this method can be read again with `load_from_disk`, so that `save_to_disk(data, file_name)`
        implies `load_from_disk(file_name) == data`.
        This method is guaranteed to not change the given data in-place."""
        graph_copy: dict = copy.deepcopy(graph)
        for word_data in graph_copy.values():
            if "warning" in word_data:
                word_data["warning"] = list(word_data["warning"])
                word_data["warning"].sort()
        resulting_data: dict = copy.deepcopy(meta_data)
        resulting_data["data"] = graph_copy
        with open(file_name, "w") as f:
            json.dump(resulting_data, f, indent=4, sort_keys=True)

    @staticmethod
    def remove_words_that_are_not_nouns(graph: GeneratedGenderNounData) -> GeneratedGenderNounData:
        """Deletes every word from the graph that is not a noun and whose male, female and neutral versions are not
        nouns either, and returns the graph. A word that is not a noun itself is kept as long as one of its gendered
        versions is a noun.

        Usually, this step has nothing to do: words that are not nouns should already be left out when the data is
        read via load_from_web. That relies on an undocumented feature of the online data set, though, so this
        function makes sure that a change of said feature doesn't break this code.

        The given object is changed in-place."""

        deleted = 0
        for word_name, word_data in list(graph.items()):
            versions = word_data["gender_map"]
            # the word itself, followed by its linked versions:
            candidates = [word_name] + [versions[g] for g in ("male", "female", "neutral") if g in versions]
            if any(is_a_noun(candidate) for candidate in candidates):
                continue

            lwarn("Deleting \"" + word_name + "\", since it is not a noun!")
            deleted += 1
            del graph[word_name]

        lwarn(deleted, "words deleted.")
        return graph

    @staticmethod
    def make_sure_all_referenced_words_exist(graph: GeneratedGenderNounData) -> GeneratedGenderNounData:
        """Adds an entry for every word that is linked as a gendered version of some word, but is missing from the
        graph. The new entry gets the gender it was linked as, and links back to the word that referenced it.
        The graph is modified in-place and returned."""

        created = 0
        for word_name, word_data in list(graph.items()):
            for gender_name, link_name in word_data["gender_map"].items():
                if link_name in graph:
                    # (that the linked word might be of a different gender than the one it is linked as is covered by
                    # create_extra_links_to_gender_ambiguous_words(), so it is not checked here.)
                    continue

                message = ("\"" + word_name + "\" lists \"" + link_name + "\" as its " + gender_name
                           + " version, but \"" + link_name + "\" does not exist in the word data file.")
                lwarn(message)
                created += 1
                graph[link_name] = {"gender": gender_name, "gender_map": {word_data["gender"]: word_name}}

        lwarn(created, "new words created.")
        return graph

    @staticmethod
    def choose_better_word(option1, option2, log=False):
        """There are cases where a word has two different gendered versions for the same gender;
        for example, "foremother" and "ancestress" are both female words for an ancestor. In these cases, the "better"
        one will be chosen according to this pipeline.
        This is relevant for `make_all_links_two_sided`, for example (see docstring).

        If `log` is set, having to resort to alphabetical decisions is logged."""

        # ToDo: Add further rules to this, or better yet, make manual (no automated data!) PRs to
        #  https://github.com/phseiff/gendered-words to clear up all these unnecessary unclearnesses.
        bad_things_in_order_of_badness: List[Callable[[str], int]] = [
            # the sooner the lambda, the worse it is:
            lambda word: len([c for c in word if c == "_"]),
            lambda word: int(word.endswith("man") or word.endswith("woman")),
            lambda word: int(word.endswith("person")),
            lambda word: int(word.endswith("mother") or word.endswith("father")),
            lambda word: int(word.endswith("parent"))
        ]
        for badness_meter in bad_things_in_order_of_badness:
            if badness_meter(option1) > badness_meter(option2):
                return option2
            elif badness_meter(option2) > badness_meter(option1):
                return option1
        # otherwise, return the alphabetically first one:
        if log:
            lwarn("Had to alphabetically decide between", option1, "and", option2)
        return sorted([option1, option2])[0]

    @staticmethod
    def make_all_links_two_sided(graph: GeneratedGenderNounData, log_clashs=False)\
            -> GeneratedGenderNounData:
        """Returns a version of the graph where every word linked to links back to the word linking to it, if this words
        gender is not yet in its gender mapping.
        This also goes for triangles where a links to b and c, but b and c are not linked.

        The graph is modified in-place and returned. Links that a word already has are never replaced.

        If `log_clashs` is set to True, clashes are logged (these are cases where there are multiple versions available
        for the gendered version of a word). This should only be enabled before
        `create_gendered_versions_for_words_that_end_with_gender_indicators` is called on the data for the first time,
        since afterwards, multiple words might be connected that weren't connected before. One example would be that
        "wonder_woman" and "wonder_girl" are both considered female, and both will get "wonder_person" assigned as their
        neutral version by `create_gendered_versions_for_words_that_end_with_gender_indicators`. This is not an issue
        since one of them already links to "wonder_boy", and the other one to "wonder_man" from the beginning, but
        they will still be considered clashing by the algorithm, so calling `make_all_links_two_sided` with
        `log_clashs` set to True after applying `create_gendered_versions_for_words_that_end_with_gender_indicators` to
        the data will create logs for two words that don't clash more than linguistically necessary.

        All warnings attached to a word are "passed" to every word of the group of words it is (directly or
        indirectly) linked to. There is no parameter to switch this off."""

        # number of links added by this call (only used for logging):
        count = 0

        # makes sure we do not report a clash twice.
        already_reported_clashes: Set[str] = set()

        def assign_value_or_use_old_one(gender_dict, gender, value):
            """Assigns a gendered version (`value`) to the given `gender` in the `gender_dict`.
            If there is already a value for the given gender, a decision is made whether to replace the old value in the
            gender dict with the newfound alternative one, and we then log the decision if `log_clashs` is set, and
            collect information required to build a warning to attach to the word to indicate that there was an
            algorithmic choose between two options."""
            if gender in gender_dict and gender_dict[gender] != value:
                # log the clash if it wasn't logged yet:
                clash_report = value + " clashes with " + gender_dict[gender]
                if log_clashs and clash_report not in already_reported_clashes:
                    lwarn(clash_report)
                    already_reported_clashes.add(clash_report)

                # choose the better of both options (keep the old word or rather use the new one):
                chosen_option = GenderNounDataHandler.choose_better_word(value, gender_dict[gender], log=log_clashs)
            else:
                chosen_option = value

            gender_dict[gender] = chosen_option
            # `gender_dict_alts` is not a parameter: it is read from the enclosing scope, where the second loop below
            # creates a fresh one for every link group before this helper is called.
            gender_dict_alts[gender] |= {chosen_option, value}

        def get_all_connected_words(word_name, data: Union[Set[str], None] = None):
            """Returns a set of all words that are indirectly or directly linked to the given word."""
            # the outermost call passes no set, so the result set is started with the word itself; the recursive calls
            # below all extend this very same set:
            if not data:
                data = {word_name}
            for link_name in graph[word_name]["gender_map"].values():
                if link_name not in data:
                    data.add(link_name)
                    get_all_connected_words(link_name, data)
                # merge in the group that was already collected for the linked word, if there is one:
                if link_name in link_groups:
                    data |= link_groups[link_name]
            return data

        # maps every word to the set of all words of its link group (including the word itself):
        link_groups: Dict[str, Set[str]] = dict()
        words_we_already_visited: Set[str] = set()
        # first iteration is to make FULL links groups of connected words before we start linking words to each other:
        for word_name, word_data in graph.items():
            if word_name not in words_we_already_visited:

                # create group of all words that are linked to this word (including those that are once removed):
                link_group = get_all_connected_words(word_name)
                for link_name in link_group:
                    if link_name in link_groups:
                        link_groups[link_name] |= link_group
                    else:
                        link_groups[link_name] = link_group

                # add these words to the visited words now:
                words_we_already_visited |= link_group

        # second iteration is to actually do the linking:
        words_we_already_visited: Set[str] = set()
        for word_name in graph.keys():
            # if we did not visit this word yet:
            if word_name not in words_we_already_visited:
                link_group = link_groups[word_name]
                words_we_already_visited |= link_group

                # create a gender-dict for all of them:
                # (genders for which a member of the group is of that gender itself; these take precedence over
                # versions that are merely listed in some member's gender_map)
                genders_we_are_very_sure_about: Set[GeneratedDataGender] = set()
                # the version chosen for each gender within this group:
                gender_dict: Dict[GeneratedDataGender, str] = dict()
                # all candidates that were considered for each gender (used to build the warning further down):
                gender_dict_alts: Dict[GeneratedDataGender, Set[str]] = {"female": set(), "male": set(),
                                                                         "neutral": set()}
                for link_name in link_group:
                    # add gender of the individual...
                    gender = graph[link_name]["gender"]
                    if gender in genders_we_are_very_sure_about:
                        assign_value_or_use_old_one(gender_dict, gender, link_name)
                    else:
                        # first group member of this gender: it replaces whatever was collected from gender_maps so far
                        gender_dict_alts[gender] = {link_name}
                        gender_dict[gender] = link_name
                    genders_we_are_very_sure_about.add(gender)

                    # ...as well as genders it specifies in its gender_map, but only if we couldn't find these otherwise
                    for gender, link_name2 in graph[link_name]["gender_map"].items():
                        if gender not in genders_we_are_very_sure_about:
                            assign_value_or_use_old_one(gender_dict, gender, link_name2)

                # link words with each other, so each word gets the full gender dict to call its own,
                # but only for genders it does not have in its own original gender dict yet, so manual links in the
                # already given data take precedence:
                for link_name in link_group:
                    for gender, link_name2 in gender_dict.items():
                        if gender not in graph[link_name]["gender_map"] and gender != graph[link_name]["gender"]:
                            graph[link_name]["gender_map"][gender] = link_name2
                            # NOTE: the comma in the middle passes the message as two arguments, which lwarn joins
                            # with its separator.
                            lwarn("\"" + link_name + "\" is (indirectly) linked to \"" + link_name2
                                  + "\", which is " + graph[link_name2]["gender"], " but \"" + link_name
                                  + "\" has no " + graph[link_name2]["gender"] + " version.")
                            count += 1

                            # if the newly added link was chosen between two possible words algorithmically, add this as
                            # a warning:
                            if gender_dict_alts[gender] != {link_name2}:
                                if "warning" not in graph[link_name]:
                                    graph[link_name]["warning"] = set()
                                alt_values = sorted(list(gender_dict_alts[gender]))
                                alt_values_str = ", ".join(alt_values[:-1]) + " and " + alt_values[-1]
                                new_warning = (alt_values_str + " would've all been good values for the " + gender + " "
                                               + "version of \"" + link_name + "\", but \"" + link_name2
                                               + "\" was automatically chosen based on an algorithm.")
                                lwarn(new_warning)
                                graph[link_name]["warning"].add(new_warning)

                # infect linked words with the warnings of the words they're linked to:
                link_group_warnings = set()
                for link_name in link_group:
                    if "warning" in graph[link_name]:
                        link_group_warnings |= graph[link_name]["warning"]
                if link_group_warnings:
                    for link_name in link_group:
                        if "warning" not in graph[link_name]:
                            graph[link_name]["warning"] = set()
                        graph[link_name]["warning"] |= link_group_warnings

        lwarn(count, "links created.")
        return graph

    @staticmethod
    def create_extra_links_to_gender_ambiguous_words(graph: GeneratedGenderNounData) -> GeneratedGenderNounData:
        """Registers linked words under the gender they are actually stored with.

        A word A may list a word B as one of its gendered versions, while B itself is stored in the graph with a
        different gender z. Whenever A has no gender-z-version yet and z is not A's own gender, B is additionally
        entered as A's gender-z-version.

        The given graph is modified in-place and returned."""

        links_added = 0
        for word_name, word_data in graph.items():
            own_gender_map = word_data["gender_map"]
            # loop over a snapshot, since entries are added to the mapping while we walk through it:
            for gender_name, link_name in list(own_gender_map.items()):
                linked_gender = graph[link_name]["gender"]
                # nothing to do if that gender is already covered by the word itself or by one of its links:
                if linked_gender in own_gender_map or linked_gender == word_data["gender"]:
                    continue
                own_gender_map[linked_gender] = link_name
                lwarn(f"\"{word_name}\" does not have a {linked_gender} version, but a "
                      f"word it links to as its {gender_name} version is {linked_gender}.")
                links_added += 1

        lwarn(links_added, "links created.")
        return graph

    @staticmethod
    def create_gendered_versions_for_words_that_end_with_gender_indicators(graph: GeneratedGenderNounData)\
            -> GeneratedGenderNounData:
        """Derives missing gendered versions of words from gendered beginnings or endings.

        Some words end (or start) with a gender indicator such as "-man", "-woman", "boy" or "maid", but lack a
        male/female/neutral version. For every non-neutral indicator a word matches, each gender the word neither
        has itself nor links to gets a version created by swapping the indicator for its counterpart of that gender.
        The counterpart word is added to the graph if it does not exist yet (with a warning describing its origin),
        and the original word receives a link to it plus a warning about the automatic link.

        Leading/trailing "_" left over from removing an indicator are stripped, and replacements that would result
        in an empty word are skipped. Only the first applicable row of the indicator table is used per word.

        The given graph is modified in-place and returned."""

        # ToDo: better suggestions regarding the maid/maiden/boy/girl-stuff are welcome!
        # ToDo: running replacement strategies like this when creating a tag with a GenderedNoun-object for an unknown
        #  noun might be a good idea... this would, however, require figuring out a words gender, so the table we have
        #  here would have to be accessible outside this function.
        #  Feel free to submit a pull request for this, or an issue if you see this fitting!
        gender_indicator_tuples_table = [
            ("start", [("female", "female"),   ("male", "male"),       ("neutral", "")]),

            ("end",   [("female", "woman"),    ("male", "man"),        ("neutral", "person")]),
            ("end",   [("female", "girl"),     ("male", "boy"),        ("neutral", "bean")]),
            ("end",   [("female", "maiden"),   ("male", "gentleman"),  ("neutral", "gentleperson")]),
            ("end",   [("female", "maid"),     ("male", "manservant"), ("neutral", "servant")]),  # or butler? domestic?
            ("end",   [("female", "aunt"),     ("male", "uncle"),      ("neutral", "auncle")]),
            ("end",   [("female", "daughter"), ("male", "son"),        ("neutral", "child")]),
            ("end",   [("female", "mother"),   ("male", "father"),     ("neutral", "parent")]),
            ("end",   [("female", "wife"),     ("male", "husband"),    ("neutral", "spouse")]),
            ("end",   [("female", "niece"),    ("male", "nephew"),     ("neutral", "nibling")]),
            ("end",   [("female", "female"),   ("male", "male"),       ("neutral", "person")]),
            ("end",   [("female", "sister"),   ("male", "brother"),    ("neutral", "sibling")]),
            ("end",   [("female", "queen"),    ("male", "king"),       ("neutral", "monarch")]),

            ("start", [("female", "woman"),    ("male", "man"),        ("neutral", "person")]),
            ("start", [("female", "girl"),     ("male", "boy"),        ("neutral", "bean")]),
            ("start", [("female", "maiden"),   ("male", "gentleman"),  ("neutral", "gentleperson")]),
            ("start", [("female", "maid"),     ("male", "manservant"), ("neutral", "servant")]),
            ("start", [("female", "aunt"),     ("male", "uncle"),      ("neutral", "auncle")]),
            ("start", [("female", "daughter"), ("male", "son"),        ("neutral", "child")]),
            ("start", [("female", "mother"),   ("male", "father"),     ("neutral", "parent")]),
            ("start", [("female", "wife"),     ("male", "husband"),    ("neutral", "spouse")]),
            ("start", [("female", "niece"),    ("male", "nephew"),     ("neutral", "nibling")]),
            ("start", [("female", "female"),   ("male", "male"),       ("neutral", "person")]),
            ("start", [("female", "sister"),   ("male", "brother"),    ("neutral", "sibling")]),
            ("start", [("female", "queen"),    ("male", "king"),       ("neutral", "monarch")])
        ]

        words_created = 0
        links_created = 0
        # iterate over a snapshot, since new words are inserted into the graph while looping:
        for word_name, word_data in list(graph.items()):
            for end_or_start, gender_indicator_tuples in gender_indicator_tuples_table:
                created_corresponding_gendered_versions = False
                for gender, gender_indicator in gender_indicator_tuples:
                    # empty indicators (the neutral counterpart of a leading "female"/"male") can never match:
                    if (((end_or_start == "end" and word_name.endswith(gender_indicator))
                            or (end_or_start == "start" and word_name.startswith(gender_indicator)))
                            and gender_indicator):
                        not_applicable = False
                        # neutral indicators like "-person" are not used to derive male/female versions:
                        if gender != "neutral":
                            other_gender_indicator_tuples = [
                                t for t in gender_indicator_tuples if t != (gender, gender_indicator)]
                            for other_gender, other_gender_indicator in other_gender_indicator_tuples:
                                # only fill in genders the word neither has itself nor already links to:
                                if other_gender in word_data["gender_map"] or other_gender == word_data["gender"]:
                                    continue
                                # figure out the potential new word:
                                if end_or_start == "end":
                                    new_gendered_version = (word_name[:-len(gender_indicator)]
                                                            + other_gender_indicator)
                                else:
                                    new_gendered_version = (other_gender_indicator
                                                            + word_name[len(gender_indicator):])
                                new_gendered_version = new_gendered_version.strip("_")
                                if new_gendered_version == "":
                                    # the replacement would leave nothing of the word; this row doesn't count as
                                    # handled, so later indicators/rows still get a chance:
                                    not_applicable = True
                                    continue
                                # create the new word or link to it; the log states where the indicator was
                                # actually found, instead of always claiming it is an ending:
                                if end_or_start == "end":
                                    lwarn("\"" + word_name + "\" ends with \"-" + gender_indicator
                                          + "\", but it has no " + other_gender + " version. ", end="")
                                else:
                                    lwarn("\"" + word_name + "\" starts with \"" + gender_indicator
                                          + "-\", but it has no " + other_gender + " version. ", end="")
                                word_data["gender_map"][other_gender] = new_gendered_version
                                links_created += 1
                                if new_gendered_version not in graph:
                                    lwarn("Creating one as \"" + new_gendered_version + "\"!")
                                    words_created += 1
                                    graph[new_gendered_version] = {
                                        "gender": other_gender,
                                        "gender_map": dict(),
                                        "warning": {"\"" + new_gendered_version
                                                    + "\" was automatically generated as the " + other_gender
                                                    + " version of a word due to the "
                                                    + ("ending" if end_or_start == "end" else "beginning")
                                                    + " of said " + "word."}
                                    }
                                else:
                                    lwarn("Linking to \"" + new_gendered_version + "\".")
                                # Add a new warning that a link was created between those two.
                                # NOTE(review): the text names `gender` (the gender of the matched indicator), not
                                #  `other_gender`; kept unchanged because these strings end up in the generated
                                #  data - confirm the intended wording.
                                if "warning" not in word_data:
                                    word_data["warning"] = set()
                                word_data["warning"].add("\"" + new_gendered_version
                                                         + "\" was automatically linked to \"" + word_name
                                                         + "\" as its " + gender + " version due to the "
                                                         + ("ending" if end_or_start == "end" else "beginning")
                                                         + " of said " + "word.")

                        if not not_applicable:
                            created_corresponding_gendered_versions = True
                            break
                # the first applicable table row wins; don't try the remaining rows for this word:
                if created_corresponding_gendered_versions:
                    break

        lwarn(words_created, "new words created.")
        lwarn(links_created, "new links created.")
        return graph

    @staticmethod
    def find_words_with_no_neutral_form(graph: GeneratedGenderNounData) -> GeneratedGenderNounData:
        """Logs every non-neutral word that does not link to a neutral version, and gives it one.

        The neutral version is picked as follows, and a warning describing the choice is added to the word:
        * male words are used as their own neutral version,
        * other words use their male version as neutral version if they have one,
        * all remaining words are used as their own neutral version.

        The given graph is modified in-place and returned."""

        instances_found = 0
        male_words_used_as_is = 0
        male_versions_used = 0
        other_words_used_as_is = 0
        for word_name, word_data in list(graph.items()):
            gender_map = word_data["gender_map"]
            # skip everything that is neutral itself or already knows its neutral version:
            if word_data["gender"] == "neutral" or "neutral" in gender_map:
                continue

            lwarn("\"" + word_name + "\" is neither neutral, nor does it link to a neutral version.")
            instances_found += 1
            word_warnings = word_data.setdefault("warning", set())

            if word_data["gender"] == "male":
                gender_map["neutral"] = word_name
                word_warnings.add(
                    f"\"{word_name}\" does not have an explicit, human-picked neutral version, and since it is male, "
                    "it's automatically used as own neutral version. This is done because lots of words in the "
                    "database are wrongly marked as male even though they are actually neutral.")
                male_words_used_as_is += 1
            elif "male" in gender_map:
                gender_map["neutral"] = gender_map["male"]
                word_warnings.add(
                    f"\"{word_name}\" does not have an explicit, human-picked neutral version, and since it has a "
                    "male version, this version is automatically used as its neutral version. This is done because "
                    "lots of words in the  database are wrongly marked as male even though they are actually "
                    "neutral.")
                male_versions_used += 1
            else:
                gender_map["neutral"] = word_name
                word_warnings.add(
                    f"\"{word_name}\" neither has a male nor a female version, so it is used as its own neutral "
                    "version.")
                other_words_used_as_is += 1
            lwarn(word_data["warning"])

        lwarn(instances_found, "instances found.")
        lwarn(male_words_used_as_is, "instances where male words where used as their own neutral version,")
        lwarn(male_versions_used, "instances where a male version of a word was used as its neutral version,")
        lwarn(other_words_used_as_is, "instances where a female word was used as its own neutral version.")

        return graph

    @staticmethod
    def create_full_graph_from_web() -> GeneratedGenderNounData:
        """Builds the complete graph of gendered words from the online data.

        The raw graph is loaded with `load_from_web` and then passed through all hole-filling steps of this method
        collection in a fixed order; an empty log line is emitted before each of these steps."""

        handler = GenderNounDataHandler
        pipeline: List[Callable] = [
            handler.remove_words_that_are_not_nouns,

            handler.make_sure_all_referenced_words_exist,
            handler.make_all_links_two_sided,

            handler.create_extra_links_to_gender_ambiguous_words,
            # clashes are only logged in this pass, before words get generated from gender indicators:
            lambda current_graph: handler.make_all_links_two_sided(current_graph, log_clashs=True),

            handler.create_gendered_versions_for_words_that_end_with_gender_indicators,
            handler.make_all_links_two_sided,

            handler.find_words_with_no_neutral_form,
        ]

        graph = handler.load_from_web()
        for step in pipeline:
            lwarn("")
            graph = step(graph)
        return graph

Static methods

def load_from_web() ‑> Dict[str, GeneratedDataWord]

Creates a JSON object describing the differently gendered versions of every gendered noun. Words with no wordnet_senseno-attribute (that are not from wordnet) are ignored, and words with "other" as gender are re-gendered as "neutral".

The data used for this is taken from https://github.com/ecmonsen/gendered_words (which is not by me; see the repository for the license).

Expand source code Browse git
@staticmethod
def load_from_web() -> GeneratedGenderNounData:
    """Creates a JSON object describing the differently gendered versions of every gendered noun.
    Words with no wordnet_senseno-attribute (that are not from wordnet) are ignored, and words with "other" as
    gender are re-gendered as "neutral".
    For every gender a word lists gendered versions for, the first listed version is used, with spaces replaced
    by underscores. Missing or empty version lists are skipped.

    The data used for this is taken from https://github.com/ecmonsen/gendered_words
    (which is not by me; see the repository for the license); it is downloaded from the raw
    `gendered_words.json` of the phseiff/gendered_words repository on GitHub.

    Raises whatever `requests.get` raises on connection problems (including a timeout), a decoding error if the
    response is not valid JSON, and a `KeyError` for words whose gender is not one of "m", "f", "n" or "o"."""

    grammatical_genders = {"m": "male", "f": "female", "n": "neutral"}

    # load from the web, in a completely wrong format that we have yet to change;
    # the timeout makes sure a stalled connection cannot block the caller forever:
    response = requests.get(
        "https://raw.githubusercontent.com/phseiff/gendered_words/master/gendered_words.json", timeout=60)
    raw_json: OriginalGenderNounData = json.loads(response.text)

    # change the format:
    result = dict()
    for word in raw_json:
        if "wordnet_senseno" not in word:
            lwarn("\"" + word["word"] + "\" ignored because it is not part of wordnet and therefore not a hyponyms "
                  + "for a person.")
            continue

        new_gender_map = dict()
        result[word["word"]] = {"gender_map": new_gender_map}
        # the gender map as well as each of its entries are optional in the original data:
        original_gender_map = word.get("gender_map") or dict()
        for short, long in grammatical_genders.items():
            gendered_versions = original_gender_map.get(short)
            if gendered_versions:
                new_gender_map[long] = gendered_versions[0]["word"].replace(" ", "_")

        if word["gender"] == "o":
            lwarn("Found an \"other\"-word! It's \"" + word["word"] + "\".")
            result[word["word"]]["gender"] = "neutral"
        else:
            result[word["word"]]["gender"] = grammatical_genders[word["gender"]]

    lwarn(len(result), "words found.")
    return result
def load_from_disk(file_name: str) ‑> (typing.Dict[str, GeneratedDataWord], typing.Dict[str, str])

Loads the gendered nouns from a piece of json data contained in the given file. The json object in this file must have a data-attribute, which contains the actual gendered noun data, and may contain an arbitrary amount of meta data in the form of strings mapped to strings, which will be returned as a dict as the second return value. The first return value is the actual gendered noun data, with the only change being that the warnings-attribute of every noun is converted from a list to a set. This assumes the gendered noun data to be of the same type as the data used throughout all the other methods of this class, NOT the format of the repository from which load_from_web loads its data.

Expand source code Browse git
@staticmethod
def load_from_disk(file_name: str) -> Tuple[GeneratedGenderNounData, Dict[str, str]]:
    """Loads the gendered nouns from a piece of json data contained in the given file.
    The json object in this file must have a data-attribute, which contains the actual gendered noun data, and may
    contain an arbitrary amount of meta data in the form of strings mapped to strings, which will be returned as a
    dict as the second return value.
    The first return value is the actual gendered noun data, with the only change being that the
    `warning`-attribute of every noun that has one is converted from a list to a set.
    This assumes the gendered noun data to be of the same type as the data used throughout all the other methods
    of this class, NOT the format of the repository from which `load_from_web` loads its data.

    The file is read as UTF-8 independently of the platform's default encoding.
    Raises a `KeyError` if the json object has no data-attribute."""
    with open(file_name, "r", encoding="utf-8") as f:
        meta_data = json.load(f)
    # split the actual noun data off from the remaining (meta) attributes:
    actual_data = meta_data.pop("data")

    # json has no sets, so warnings are stored as lists on disk:
    for word_data in actual_data.values():
        if "warning" in word_data:
            word_data["warning"] = set(word_data["warning"])
    return actual_data, meta_data
def save_to_disk(graph: Dict[str, GeneratedDataWord], file_name: str, **meta_data: Dict[str, str]) ‑> NoneType

Saves the given gendered nouns data to the given file as json. The data is saved without any modifications, except for the warnings-attribute of all noun data, which is converted from a set to a list to be compatible with standard json format, in the data-attribute of the resulting json file. meta_data may be used to describe additional meta data in the form of strings of the resulting file. This assumes the given data to be of the same type as the data used throughout all the other methods of this class, NOT the format of the repository from which load_from_web loads its data. The data saved with this method can be read again with load_from_disk, so that save_to_disk(data, file_name) implies load_from_disk(file_name) == data. This method is guaranteed to not change the given data in-place.

Expand source code Browse git
@staticmethod
def save_to_disk(graph: GeneratedGenderNounData, file_name: str, **meta_data: Dict[str, str]) -> None:
    """Writes the given gendered noun data to `file_name` as json.

    The noun data ends up in the data-attribute of the written json object; the only modification is that the
    `warning`-attribute of every noun that has one is turned from a set into a sorted list, since sets are not
    part of standard json. Everything passed as `meta_data` is stored next to the data-attribute.
    The data is expected to be in the format used by the other methods of this class, NOT in the format of the
    repository `load_from_web` reads from, and can be read back with `load_from_disk`.
    The given data is never changed in-place, since all work is done on deep copies."""
    serializable_graph: dict = copy.deepcopy(graph)
    for entry in serializable_graph.values():
        if "warning" in entry:
            entry["warning"] = sorted(entry["warning"])

    payload: dict = copy.deepcopy(meta_data)
    payload["data"] = serializable_graph
    with open(file_name, "w") as out_file:
        json.dump(payload, out_file, indent=4, sort_keys=True)
def remove_words_that_are_not_nouns(graph: Dict[str, GeneratedDataWord]) ‑> Dict[str, GeneratedDataWord]

Removes all elements that are not nouns from the graph. Words that are not nouns, but whose gendered versions contain nouns for some reason, are not purged by this. Returns the result. This step should usually do nothing, since all words that are not nouns should already be filtered out when the data is read via load_from_web, but this uses an undocumented feature of the online data set it uses, so this function ensures that changes of that feature don't break this code.

This step may or may not change the given object in-place.

Expand source code Browse git
@staticmethod
def remove_words_that_are_not_nouns(graph: GeneratedGenderNounData) -> GeneratedGenderNounData:
    """Deletes every word from the graph for which neither the word itself nor any of its male, female or neutral
    versions is a noun according to `is_a_noun`, and returns the graph.

    Usually, this changes nothing, because words that are not nouns should already be dropped when the data is
    read via load_from_web. That filtering relies on an undocumented feature of the online data set, though, so
    this function makes sure a change of that feature does not break this code.

    The given object is changed in-place."""

    def has_noun_version(word_data) -> bool:
        # checks the gendered versions in a fixed order and stops at the first noun found
        return any(is_a_noun(word_data["gender_map"][grammatical_gender])
                   for grammatical_gender in ("male", "female", "neutral")
                   if grammatical_gender in word_data["gender_map"])

    deleted = 0
    # iterate over a snapshot, since words get deleted from the graph while looping:
    for word_name, word_data in list(graph.items()):
        if is_a_noun(word_name) or has_noun_version(word_data):
            continue
        lwarn(f"Deleting \"{word_name}\", since it is not a noun!")
        deleted += 1
        del graph[word_name]

    lwarn(deleted, "words deleted.")
    return graph
def make_sure_all_referenced_words_exist(graph: Dict[str, GeneratedDataWord]) ‑> Dict[str, GeneratedDataWord]

Returns a version of the graph where every word linked as a differently gendered version of a word exists.

Expand source code Browse git
@staticmethod
def make_sure_all_referenced_words_exist(graph: GeneratedGenderNounData) -> GeneratedGenderNounData:
    """Adds every word that is referenced as a gendered version of another word, but missing from the graph.

    A newly added word gets the gender it was referenced as, and links back to the referencing word as the
    version for that word's gender. The graph is modified in-place and returned."""

    created = 0
    # iterate over a snapshot, since new words are inserted into the graph while looping:
    for word_name, word_data in list(graph.items()):
        for gender_name, link_name in word_data["gender_map"].items():
            if link_name in graph:
                # whether the existing word actually has the referenced gender is not checked here; that case is
                # covered by create_extra_links_to_gender_ambiguous_words().
                continue
            lwarn(f"\"{word_name}\" lists \"{link_name}\" as its {gender_name} version, but \""
                  f"{link_name}\" does not exist in the word data file.")
            created += 1
            graph[link_name] = {
                "gender": gender_name,
                "gender_map": {word_data["gender"]: word_name},
            }

    lwarn(created, "new words created.")
    return graph
def choose_better_word(option1, option2, log=False)

There are cases where a word has two different gendered versions for the same gender; for example, "foremother" and "ancestress" are both female words for an ancestor. In these cases, the "better" one will be chosen according to this pipeline. This is relevant for make_all_links_two_sided, for example (see docstring).

If log is set, having to resort to alphabetical decisions is logged.

Expand source code Browse git
@staticmethod
def choose_better_word(option1, option2, log=False):
    """There are cases where a word has two different gendered versions for the same gender;
    for example, "foremother" and "ancestress" are both female words for an ancestor. In these cases, the "better"
    one will be chosen according to this pipeline.
    This is relevant for `make_all_links_two_sided`, for example (see docstring).

    If `log` is set, having to resort to alphabetical decisions is logged."""

    # ToDo: Add further rules to this, or better yet, make manual (no automated data!) PRs to
    #  https://github.com/phseiff/gendered-words to clear up all these unnecessary unclearnesses.
    bad_things_in_order_of_badness: List[Callable[[str], int]] = [
        # the sooner the lambda, the worse it is:
        lambda word: len([c for c in word if c == "_"]),
        lambda word: int(word.endswith("man") or word.endswith("woman")),
        lambda word: int(word.endswith("person")),
        lambda word: int(word.endswith("mother") or word.endswith("father")),
        lambda word: int(word.endswith("parent"))
    ]
    for badness_meter in bad_things_in_order_of_badness:
        if badness_meter(option1) > badness_meter(option2):
            return option2
        elif badness_meter(option2) > badness_meter(option1):
            return option1
    # otherwise, return the alphabetically first one:
    if log:
        lwarn("Had to alphabetically decide between", option1, "and", option2)
    return sorted([option1, option2])[0]

Returns a version of the graph where every word linked to links back to the word linking to it, if this words gender is not yet in its gender mapping. This also goes for triangles where a links to b and c, but b and c are not linked.

If log_clashs is set to True, clashes are logged (these are cases where there are multiple versions available for the gendered version of a word). This should only be enabled before create_gendered_versions_for_words_that_end_with_gender_indicators is called on the data for the first time, since afterwards, multiple words might be connected that weren't connected before. One example would be that "wonder_woman" and "wonder_girl" are both considered female, and both will get "wonder_person" assigned as their neutral version by create_gendered_versions_for_words_that_end_with_gender_indicators. This is not an issue since one of them already links to "wonder_boy", and the other one to "wonder_man" from the beginning, but they will still be considered clashing by the algorithm, so calling make_all_links_two_sided with log_clashs set to True after applying create_gendered_versions_for_words_that_end_with_gender_indicators to the data will create logs for two words that don't clash more than linguistically necessary.

If infect_warnings isn't explicitly set to False, all warnings added to a word are "passed" to every word it's linked to.

Expand source code Browse git
@staticmethod
def make_all_links_two_sided(graph: GeneratedGenderNounData, log_clashs=False, infect_warnings=True)\
        -> GeneratedGenderNounData:
    """Returns a version of the graph where every word linked to links back to the word linking to it, if this word's
    gender is not yet in its gender mapping.
    This also goes for triangles where a links to b and c, but b and c are not linked.

    The given `graph` is modified in place, and the same object is returned.

    If `log_clashs` is set to True, clashes are logged (these are cases where there are multiple versions available
    for the gendered version of a word). This should only be enabled before
    `create_gendered_versions_for_words_that_end_with_gender_indicators` is called on the data for the first time,
    since afterwards, multiple words might be connected that weren't connected before. One example would be that
    "wonder_woman" and "wonder_girl" are both considered female, and both will get "wonder_person" assigned as their
    neutral version by `create_gendered_versions_for_words_that_end_with_gender_indicators`. This is not an issue
    since one of them already links to "wonder_boy", and the other one to "wonder_man" from the beginning, but
    they will still be considered clashing by the algorithm, so calling `make_all_links_two_sided` with
    `log_clashs` set to True after applying `create_gendered_versions_for_words_that_end_with_gender_indicators` to
    the data will create logs for two words that don't clash more than linguistically necessary.

    If `infect_warnings` isn't explicitly set to False, all warnings added to a word are "passed" to every word
    it's linked to."""

    count = 0

    # makes sure we do not report a clash twice.
    already_reported_clashes: Set[str] = set()

    def assign_value_or_use_old_one(gender_dict, gender, value):
        """Assigns a gendered version (`value`) to the given `gender` in the `gender_dict`.
        If there is already a different value for the given gender, a decision is made whether to replace the old
        value in the gender dict with the newfound alternative one, and we then log the decision if `log_clashs` is
        set, and collect information required to build a warning to attach to the word to indicate that there was
        an algorithmic choice between two options."""
        if gender in gender_dict and gender_dict[gender] != value:
            # log the clash if it wasn't logged yet:
            clash_report = value + " clashes with " + gender_dict[gender]
            if log_clashs and clash_report not in already_reported_clashes:
                lwarn(clash_report)
                already_reported_clashes.add(clash_report)

            # choose the better of both options (keep the old word or rather use the new one):
            chosen_option = GenderNounDataHandler.choose_better_word(value, gender_dict[gender], log=log_clashs)
        else:
            chosen_option = value

        gender_dict[gender] = chosen_option
        # `gender_dict_alts` is the per-link-group dict created in the main loop below; it remembers every candidate
        # that was ever considered for a gender, so a warning can list all of them later:
        gender_dict_alts[gender] |= {chosen_option, value}

    def get_all_connected_words(word_name, data: Union[Set[str], None] = None):
        """Returns a set of all words that are indirectly or directly linked to the given word.
        `data` is the set of words collected so far; it is only passed by the recursive calls."""
        if data is None:
            data = {word_name}
        for link_name in graph[word_name]["gender_map"].values():
            if link_name not in data:
                data.add(link_name)
                get_all_connected_words(link_name, data)
            # reuse groups found in earlier iterations, so words that merely link *into* an already known group
            # end up in the same group:
            if link_name in link_groups:
                data |= link_groups[link_name]
        return data

    link_groups: Dict[str, Set[str]] = dict()
    words_we_already_visited: Set[str] = set()
    # first iteration is to make FULL links groups of connected words before we start linking words to each other:
    for word_name in graph:
        if word_name not in words_we_already_visited:

            # create group of all words that are linked to this word (including those that are once removed):
            link_group = get_all_connected_words(word_name)
            for link_name in link_group:
                if link_name in link_groups:
                    link_groups[link_name] |= link_group
                else:
                    link_groups[link_name] = link_group

            # add these words to the visited words now:
            words_we_already_visited |= link_group

    # second iteration is to actually do the linking:
    words_we_already_visited = set()
    for word_name in graph:
        # if we did not visit this word yet:
        if word_name not in words_we_already_visited:
            link_group = link_groups[word_name]
            words_we_already_visited |= link_group

            # create a gender-dict for all of them:
            genders_we_are_very_sure_about: Set[GeneratedDataGender] = set()
            gender_dict: Dict[GeneratedDataGender, str] = dict()
            gender_dict_alts: Dict[GeneratedDataGender, Set[str]] = {"female": set(), "male": set(),
                                                                     "neutral": set()}
            for link_name in link_group:
                # add gender of the individual...
                gender = graph[link_name]["gender"]
                if gender in genders_we_are_very_sure_about:
                    assign_value_or_use_old_one(gender_dict, gender, link_name)
                else:
                    # a word that actually *has* this gender overrides anything that was only found via a
                    # gender_map so far:
                    gender_dict_alts[gender] = {link_name}
                    gender_dict[gender] = link_name
                genders_we_are_very_sure_about.add(gender)

                # ...as well as genders it specifies in its gender_map, but only if we couldn't find these otherwise
                for gender, link_name2 in graph[link_name]["gender_map"].items():
                    if gender not in genders_we_are_very_sure_about:
                        assign_value_or_use_old_one(gender_dict, gender, link_name2)

            # link words with each other, so each word gets the full gender dict to call its own,
            # but only for genders it does not have in its own original gender dict yet, so manual links in the
            # already given data take precedence:
            for link_name in link_group:
                for gender, link_name2 in gender_dict.items():
                    if gender not in graph[link_name]["gender_map"] and gender != graph[link_name]["gender"]:
                        graph[link_name]["gender_map"][gender] = link_name2
                        lwarn("\"" + link_name + "\" is (indirectly) linked to \"" + link_name2
                              + "\", which is " + graph[link_name2]["gender"], " but \"" + link_name
                              + "\" has no " + graph[link_name2]["gender"] + " version.")
                        count += 1

                        # if the newly added link was chosen between two possible words algorithmically, add this as
                        # a warning:
                        if gender_dict_alts[gender] != {link_name2}:
                            if "warning" not in graph[link_name]:
                                graph[link_name]["warning"] = set()
                            alt_values = sorted(gender_dict_alts[gender])
                            alt_values_str = ", ".join(alt_values[:-1]) + " and " + alt_values[-1]
                            new_warning = (alt_values_str + " would've all been good values for the " + gender + " "
                                           + "version of \"" + link_name + "\", but \"" + link_name2
                                           + "\" was automatically chosen based on an algorithm.")
                            lwarn(new_warning)
                            graph[link_name]["warning"].add(new_warning)

            # infect linked words with the warnings of the words they're linked to (unless this was switched off):
            if infect_warnings:
                link_group_warnings = set()
                for link_name in link_group:
                    if "warning" in graph[link_name]:
                        link_group_warnings |= graph[link_name]["warning"]
                if link_group_warnings:
                    for link_name in link_group:
                        if "warning" not in graph[link_name]:
                            graph[link_name]["warning"] = set()
                        graph[link_name]["warning"] |= link_group_warnings

    lwarn(count, "links created.")
    return graph

There may be some words A with gender x, that list another word B as their gender-y-version, but don't have any gender-z-version, and word B is listed as gender z for some reason. This method returns a version of the given graph where A links to B as its gender-z-version.

Expand source code Browse git
@staticmethod
def create_extra_links_to_gender_ambiguous_words(graph: GeneratedGenderNounData) -> GeneratedGenderNounData:
    """Adds links to words whose own gender differs from the role they are linked as.

    If a word A links to a word B (as whatever gendered version), and B itself has a gender that A neither has
    itself nor has any version for yet, B is additionally registered as A's version for B's own gender.
    The graph is modified in place and returned."""

    links_added = 0
    for word_name, word_data in graph.items():
        own_versions = word_data["gender_map"]
        # iterate over a snapshot, since new entries are added to the mapping while walking through it:
        for gender_name, link_name in list(own_versions.items()):
            linked_gender = graph[link_name]["gender"]
            # nothing to do if this gender is already covered, either by a link or by the word itself:
            if linked_gender in own_versions or linked_gender == word_data["gender"]:
                continue
            own_versions[linked_gender] = link_name
            lwarn("\"" + word_name + "\" does not have a " + linked_gender
                  + " version, but a word it links to as its " + gender_name
                  + " version is " + linked_gender + ".")
            links_added += 1

    lwarn(links_added, "links created.")
    return graph
def create_gendered_versions_for_words_that_end_with_gender_indicators(graph: Dict[str, GeneratedDataWord]) ‑> Dict[str, GeneratedDataWord]

Some words end on "-man" or "-woman", or similar things like "boy" or "maid", but don't have a male/female/neutral version. This method returns a version of the graph where every word of these has a male, female as well as neutral version. It also ensures that the resulting word doesn't end with "_" due to having its end or beginning removed, and that it doesn't create "empty" words.

Expand source code Browse git
@staticmethod
def create_gendered_versions_for_words_that_end_with_gender_indicators(graph: GeneratedGenderNounData)\
        -> GeneratedGenderNounData:
    """Some words end on "-man" or "-woman", or similar things like "boy" or "maid" (or start with them), but don't
    have a male/female/neutral version. This method returns a version of the graph where every word of these has a
    male, female as well as neutral version. It also ensures that the resulting word doesn't start or end with "_"
    due to having its end or beginning removed, and that it doesn't create "empty" words.

    For every word, only the first matching row of the indicator table is applied. Words that do not exist in the
    graph yet are created (with a warning attached that explains where they come from), existing ones are just
    linked to. The graph is modified in place and returned."""

    # ToDo: better suggestions regarding the maid/maiden/boy/girl-stuff are welcome!
    # ToDo: running replacement strategies like this when creating a tag with a GenderedNoun-object for an unknown
    #  noun might be a good idea... this would, however, require figuring out a words gender, so the table we have
    #  here would have to be accessible outside this function.
    #  Feel free to submit a pull request for this, or an issue if you see this fitting!
    gender_indicator_triples = [
        [("female", "woman"),    ("male", "man"),        ("neutral", "person")],
        [("female", "girl"),     ("male", "boy"),        ("neutral", "bean")],
        [("female", "maiden"),   ("male", "gentleman"),  ("neutral", "gentleperson")],
        [("female", "maid"),     ("male", "manservant"), ("neutral", "servant")],  # or butler? domestic?
        [("female", "aunt"),     ("male", "uncle"),      ("neutral", "auncle")],
        [("female", "daughter"), ("male", "son"),        ("neutral", "child")],
        [("female", "mother"),   ("male", "father"),     ("neutral", "parent")],
        [("female", "wife"),     ("male", "husband"),    ("neutral", "spouse")],
        [("female", "niece"),    ("male", "nephew"),     ("neutral", "nibling")],
        [("female", "female"),   ("male", "male"),       ("neutral", "person")],
        [("female", "sister"),   ("male", "brother"),    ("neutral", "sibling")],
        [("female", "queen"),    ("male", "king"),       ("neutral", "monarch")]
    ]
    # order matters, since only the first matching row is used for a word: a leading "female"/"male" is simply
    # dropped for the neutral version, then all indicators are tried as endings, and then as beginnings.
    gender_indicator_tuples_table = (
        [("start", [("female", "female"), ("male", "male"), ("neutral", "")])]
        + [("end", triple) for triple in gender_indicator_triples]
        + [("start", triple) for triple in gender_indicator_triples]
    )

    words_created = 0
    links_created = 0
    # iterate over a snapshot, since new words are added to the graph while walking through it:
    for word_name, word_data in list(graph.items()):
        for end_or_start, gender_indicator_tuples in gender_indicator_tuples_table:
            at_end = end_or_start == "end"
            affix_kind = "ending" if at_end else "beginning"
            created_corresponding_gendered_versions = False
            for gender, gender_indicator in gender_indicator_tuples:
                # the empty neutral indicator of the first table row can't be matched against:
                if not gender_indicator:
                    continue
                if at_end:
                    has_indicator = word_name.endswith(gender_indicator)
                else:
                    has_indicator = word_name.startswith(gender_indicator)
                if not has_indicator:
                    continue

                not_applicable = False
                if gender != "neutral":
                    for other_gender, other_gender_indicator in gender_indicator_tuples:
                        if (other_gender, other_gender_indicator) == (gender, gender_indicator):
                            continue
                        # genders the word has itself or already has a version for are left untouched:
                        if other_gender in word_data["gender_map"] or other_gender == word_data["gender"]:
                            continue
                        # figure out the potential new word:
                        if at_end:
                            new_gendered_version = word_name[:-len(gender_indicator)] + other_gender_indicator
                        else:
                            new_gendered_version = other_gender_indicator + word_name[len(gender_indicator):]
                        new_gendered_version = new_gendered_version.strip("_")
                        if new_gendered_version == "":
                            not_applicable = True
                            continue
                        # create the new word or link to it:
                        if at_end:
                            lwarn("\"" + word_name + "\" ends with \"-" + gender_indicator
                                  + "\", but it has no " + other_gender + " version. ", end="")
                        else:
                            lwarn("\"" + word_name + "\" starts with \"" + gender_indicator
                                  + "-\", but it has no " + other_gender + " version. ", end="")
                        word_data["gender_map"][other_gender] = new_gendered_version
                        links_created += 1
                        if new_gendered_version not in graph:
                            lwarn("Creating one as \"" + new_gendered_version + "\"!")
                            words_created += 1
                            graph[new_gendered_version] = {
                                "gender": other_gender,
                                "gender_map": dict(),
                                "warning": {"\"" + new_gendered_version
                                            + "\" was automatically generated as the " + other_gender
                                            + " version of a word due to the " + affix_kind
                                            + " of said " + "word."}
                            }
                        else:
                            lwarn("Linking to \"" + new_gendered_version + "\".")
                        # Add a new warning that a link was created between those two; the new word is the
                        # `other_gender` version of this word (not a version of the gender the word already has):
                        if "warning" not in word_data:
                            word_data["warning"] = set()
                        word_data["warning"].add("\"" + new_gendered_version
                                                 + "\" was automatically linked to \"" + word_name
                                                 + "\" as its " + other_gender + " version due to the "
                                                 + affix_kind + " of said " + "word.")

                if not not_applicable:
                    created_corresponding_gendered_versions = True
                    break
            if created_corresponding_gendered_versions:
                break

    lwarn(words_created, "new words created.")
    lwarn(links_created, "new links created.")
    return graph
def find_words_with_no_neutral_form(graph: Dict[str, GeneratedDataWord]) ‑> Dict[str, GeneratedDataWord]

Informs about every word that does not have a neutral version, and uses this word as its own neutral version. Gives each of these words a neutral form by using the male version of the word by default and the female version if there is no male one.

Expand source code Browse git
@staticmethod
def find_words_with_no_neutral_form(graph: GeneratedGenderNounData) -> GeneratedGenderNounData:
    """Reports every non-neutral word that has no neutral version, and gives it one.

    A male word becomes its own neutral version; any other word uses its male version as neutral version if it has
    one, and itself otherwise. Every word treated this way gets a warning attached that explains the choice.
    The graph is modified in place and returned."""

    found = 0
    male_words_used_as_own_neutral = 0
    male_versions_used_as_neutral = 0
    other_words_used_as_own_neutral = 0

    for word_name, word_data in list(graph.items()):
        # neutral words and words with a neutral version are fine as they are:
        if word_data["gender"] == "neutral" or "neutral" in word_data["gender_map"]:
            continue

        lwarn("\"" + word_name + "\" is neither neutral, nor does it link to a neutral version.")
        found += 1
        word_warnings = word_data.setdefault("warning", set())
        versions = word_data["gender_map"]

        if word_data["gender"] == "male":
            versions["neutral"] = word_name
            word_warnings.add("\"" + word_name
                              + "\" does not have an explicit, human-picked neutral version, and since it is "
                                "male, it's automatically used as own neutral version. This is done because lots "
                                "of words in the database are wrongly marked as male even though they are "
                                "actually neutral.")
            male_words_used_as_own_neutral += 1
        elif "male" in versions:
            versions["neutral"] = versions["male"]
            word_warnings.add("\"" + word_name
                              + "\" does not have an explicit, human-picked neutral version, and since it has a "
                                "male version, this version is automatically used as its neutral version. This is "
                                "done because lots of words in the  database are wrongly marked as male even "
                                "though they are actually neutral.")
            male_versions_used_as_neutral += 1
        else:
            versions["neutral"] = word_name
            word_warnings.add("\"" + word_name
                              + "\" neither has a male nor a female version, so it is used as its own neutral "
                                "version.")
            other_words_used_as_own_neutral += 1
        lwarn(word_data["warning"])

    lwarn(found, "instances found.")
    lwarn(male_words_used_as_own_neutral, "instances where male words where used as their own neutral version,")
    lwarn(male_versions_used_as_neutral, "instances where a male version of a word was used as its neutral version,")
    lwarn(other_words_used_as_own_neutral, "instances where a female word was used as its own neutral version.")

    return graph
def create_full_graph_from_web() ‑> Dict[str, GeneratedDataWord]

A pipeline that combines all methods of this method collection to pull a graph of gendered words from the web and automatically fill all holes this graph has left open.

Expand source code Browse git
@staticmethod
def create_full_graph_from_web() -> GeneratedGenderNounData:
    """Builds the complete graph of gendered words: loads the raw graph from the web and then runs it through all
    clean-up and gap-filling steps of `GenderNounDataHandler`, one after another."""

    handler = GenderNounDataHandler

    # the stages in the order they are applied; every stage takes the graph and hands back the processed graph:
    stages = (
        lambda g: handler.remove_words_that_are_not_nouns(g),
        lambda g: handler.make_sure_all_referenced_words_exist(g),
        lambda g: handler.make_all_links_two_sided(g),
        lambda g: handler.create_extra_links_to_gender_ambiguous_words(g),
        # clashes are only logged in this pass, before words are generated from gender indicators:
        lambda g: handler.make_all_links_two_sided(g, log_clashs=True),
        lambda g: handler.create_gendered_versions_for_words_that_end_with_gender_indicators(g),
        lambda g: handler.make_all_links_two_sided(g),
        lambda g: handler.find_words_with_no_neutral_form(g),
    )

    graph = handler.load_from_web()
    for stage in stages:
        # an empty log line separates the output of the individual stages:
        lwarn("")
        graph = stage(graph)
    return graph
class GenderedNoun (word: str)

A representation of a gendered noun, with methods to get gendered equivalents of it.

Generates an object to get gendered versions of the given noun for different genders.

Expand source code Browse git
class GenderedNoun:
    """A representation of a gendered noun, with methods to get gendered equivalents of it.

    Two instances are equal (and hash equally) if they represent the same word."""

    def __init__(self, word: str):
        """Generates an object to get gendered versions of the given noun for different genders.

        Raises a warning via the `WarningManager` if the word is not in `GENDER_DICT` (telling whether it is not a
        known word, not a known noun, or not a person noun), or if `GENDER_DICT` has warnings attached to it."""

        # the word is saved and looked up exactly as given (there is no case normalization):
        self.word = word

        # raise warnings if the word is not a word/ noun/ person noun:
        if word not in GENDER_DICT:
            if not is_a_word(word):
                warnings.WarningManager.raise_warning("\"" + word + "\" is not a known word, so gender*render might not"
                                                      + " be able to gender it correctly.", warnings.NotAWordWarning)
            elif not is_a_noun(word):
                warnings.WarningManager.raise_warning("\"" + word + "\" is not a known noun, so gender*render might not"
                                                      + " be able to gender it correctly.", warnings.NotANounWarning)
            else:
                warnings.WarningManager.raise_warning("\"" + word + "\" is not a hyponym for person, so gender*render "
                                                      + "might not be able to gender it correctly.",
                                                      warnings.NotAPersonNounWarning)
        elif "warning" in GENDER_DICT[word]:
            warnings.WarningManager.raise_warning("warnings for \"" + word + "\":\n"
                                                  + "\n".join(list(GENDER_DICT[word]["warning"])),
                                                  warnings.NounGenderingGuessingsWarning)
            # ToDo: Maybe only print those warnings that contain `"\"" + word + "\""` in them? This would require
            #  reviewing all warnings attached to words by this modules code, to be sure this actually prints all
            #  relevant warnings, as well as injecting some trivial code here and generally discussing this idea in an
            #  issue.
            #  See also the comment in test/test_gender_nouns in test_create_full_graph_from_web.

    def render_noun(self, gender: GeneratedDataGender) -> str:
        """Returns the correctly gendered version of itself as a string. gender must be either "male", "female" or
        "neutral".

        If there is no version for the requested gender, the neutral version is used instead; if there is no
        neutral version either, or if the word is unknown, the word itself is returned. Underscores are replaced
        by spaces in the result."""  # ToDo: Re-test this since capitalization is no longer supported.

        word = self.word
        if word in GENDER_DICT:
            word_data = GENDER_DICT[word]
            # look for the neutral version if there is no version of the given gender:
            if gender not in set(word_data["gender_map"].keys()) | {word_data["gender"]}:
                gender = "neutral"
            # return the word if it is the right gender:
            if word_data["gender"] == gender:
                result = word
            # otherwise, return the correctly gendered version from the gender_map; if we fell back to "neutral"
            # and there is no neutral version, keep the word as it is rather than failing with a KeyError:
            else:
                result = word_data["gender_map"].get(gender, word)
        else:
            result = word
        return result.replace("_", " ")

    def __eq__(self, other) -> bool:
        """Checks whether two GenderedNoun-representations are identical, based on what noun they represent."""
        if isinstance(other, self.__class__):
            return self.word == other.word
        else:
            return False

    def __hash__(self) -> int:
        """Hashes the represented noun, so that equal objects hash equally and instances stay usable in sets and as
        dict keys (defining `__eq__` alone would make them unhashable)."""
        return hash(self.word)

Methods

def render_noun(self, gender: typing_extensions.Literal['female', 'male', 'neutral']) ‑> str

Returns the correctly gendered version of itself as a string. gender must be either "male", "female" or "neutral".

Expand source code Browse git
def render_noun(self, gender: GeneratedDataGender) -> str:
    """Returns the correctly gendered version of itself as a string. gender must be either "male", "female" or
    "neutral".

    If there is no version for the requested gender, the neutral version is used instead; if there is no
    neutral version either, or if the word is unknown, the word itself is returned. Underscores are replaced
    by spaces in the result."""  # ToDo: Re-test this since capitalization is no longer supported.

    word = self.word
    if word in GENDER_DICT:
        word_data = GENDER_DICT[word]
        # look for the neutral version if there is no version of the given gender:
        if gender not in set(word_data["gender_map"].keys()) | {word_data["gender"]}:
            gender = "neutral"
        # return the word if it is the right gender:
        if word_data["gender"] == gender:
            result = word
        # otherwise, return the correctly gendered version from the gender_map; if we fell back to "neutral"
        # and there is no neutral version, keep the word as it is rather than failing with a KeyError:
        else:
            result = word_data["gender_map"].get(gender, word)
    else:
        result = word
    return result.replace("_", " ")