Skip to content

API

CitableDocument

A Citation's extract_citations() function relies on a CitableDocument.

Creates three main reusable lists:

list concept
@docketed_reports list of DocketReportCitation found in the text, excluding exceptional statutory dockets
@reports list of Report found in the text (which may already be included in @docketed_reports)
@undocketed_reports = @reports - @docketed_reports, i.e. the reports found in the text that have no accompanying docket

Examples:

Python Console Session
>>> text_statutes = "Bar Matter No. 803, Jan. 1, 2000; Bar Matter No. 411, Feb. 1, 2000"
>>> len(CitableDocument(text=text_statutes).docketed_reports) # no citations, since these are 'statutory dockets'
0
>>> text_cites = "374 Phil. 1, 10-11 (1999) 1111 SCRA 1111; G.R. No. 147033, April 30, 2003; G.R. No. 147033, April 30, 2003, 374 Phil. 1, 600; ABC v. XYZ, G.R. Nos. 138570, 138572, 138587, 138680, 138698, October 10, 2000, 342 SCRA 449;  XXX, G.R. No. 31711, Sept. 30, 1971, 35 SCRA 190; Hello World, 1111 SCRA 1111; Y v. Z, 35 SCRA 190;"
>>> doc1 = CitableDocument(text=text_cites)
>>> len(doc1.docketed_reports)
4
>>> doc1.undocketed_reports
{'1111 SCRA 1111'}
>>> text = "<em>Gatchalian Promotions Talent Pool, Inc. v. Atty. Naldoza</em>, 374 Phil. 1, 10-11 (1999), citing: <em>In re Almacen</em>, 31 SCRA 562, 600 (1970).; People v. Umayam, G.R. No. 147033, April 30, 2003; <i>Bagong Alyansang Makabayan v. Zamora,</i> G.R. Nos. 138570, 138572, 138587, 138680, 138698, October 10, 2000, 342 SCRA 449; Villegas <em>v.</em> Subido, G.R. No. 31711, Sept. 30, 1971, 41 SCRA 190;"
>>> doc2 = CitableDocument(text=text)
>>> set(doc2.get_citations()) == {'GR No. 147033, Apr. 30, 2003', 'GR No. 138570, Oct. 10, 2000, 342 SCRA 449', 'GR No. 31711, Sep. 30, 1971, 41 SCRA 190', '374 Phil. 1', '31 SCRA 562'}
True
Source code in src/citation_utils/document.py
Python
@dataclass
class CitableDocument:
    """Collect the citations found in a piece of text.

    Creates three main reusable collections:

    list | concept
    :--:|:--:
    `@docketed_reports` | list of `DocketReportCitation` found in the text, excluding exceptional statutory dockets
    `@reports` | list of `Report` found in the text (which may already be included in `@docketed_reports`)
    `@undocketed_reports` | = `@reports` - `@docketed_reports`, i.e. the set of report strings without an accompanying docket

    Examples:
        >>> text_statutes = "Bar Matter No. 803, Jan. 1, 2000; Bar Matter No. 411, Feb. 1, 2000"
        >>> len(CitableDocument(text=text_statutes).docketed_reports) # no citations, since these are 'statutory dockets'
        0
        >>> text_cites = "374 Phil. 1, 10-11 (1999) 1111 SCRA 1111; G.R. No. 147033, April 30, 2003; G.R. No. 147033, April 30, 2003, 374 Phil. 1, 600; ABC v. XYZ, G.R. Nos. 138570, 138572, 138587, 138680, 138698, October 10, 2000, 342 SCRA 449;  XXX, G.R. No. 31711, Sept. 30, 1971, 35 SCRA 190; Hello World, 1111 SCRA 1111; Y v. Z, 35 SCRA 190;"
        >>> doc1 = CitableDocument(text=text_cites)
        >>> len(doc1.docketed_reports)
        4
        >>> doc1.undocketed_reports
        {'1111 SCRA 1111'}
        >>> text = "<em>Gatchalian Promotions Talent Pool, Inc. v. Atty. Naldoza</em>, 374 Phil. 1, 10-11 (1999), citing: <em>In re Almacen</em>, 31 SCRA 562, 600 (1970).; People v. Umayam, G.R. No. 147033, April 30, 2003; <i>Bagong Alyansang Makabayan v. Zamora,</i> G.R. Nos. 138570, 138572, 138587, 138680, 138698, October 10, 2000, 342 SCRA 449; Villegas <em>v.</em> Subido, G.R. No. 31711, Sept. 30, 1971, 41 SCRA 190;"
        >>> doc2 = CitableDocument(text=text)
        >>> set(doc2.get_citations()) == {'GR No. 147033, Apr. 30, 2003', 'GR No. 138570, Oct. 10, 2000, 342 SCRA 449', 'GR No. 31711, Sep. 30, 1971, 41 SCRA 190', '374 Phil. 1', '31 SCRA 562'}
        True
    """  # noqa: E501

    text: str

    def __post_init__(self):
        """Normalize the text (NFKD) and precompute the three collections."""
        self.text = unicodedata.normalize("NFKD", self.text)
        self.reports = list(Report.extract_reports(self.text))
        self.docketed_reports = list(self.get_docketed_reports(self.text))
        # Must run last: it reads `self.docketed_reports`.
        self.undocketed_reports = self.get_undocketed_reports()

    @classmethod
    def get_docketed_reports(
        cls, text: str, exclude_docket_rules: bool = True
    ) -> Iterator[DocketReport]:
        """Extract from `raw` text all raw citations which should include their `Docket` and `Report` component parts.
        This may however include statutory rules since some docket categories like AM and BM use this convention.
        To exclude statutory rules, a flag is included as a default.

        Examples:
            >>> cite = next(CitableDocument.get_docketed_reports("Bagong Alyansang Makabayan v. Zamora, G.R. Nos. 138570, 138572, 138587, 138680, 138698, October 10, 2000, 342 SCRA 449"))
            >>> cite.model_dump(exclude_none=True)
            {'publisher': 'SCRA', 'volume': '342', 'page': '449', 'context': 'G.R. Nos. 138570, 138572, 138587, 138680, 138698', 'category': 'GR', 'ids': '138570, 138572, 138587, 138680, 138698', 'docket_date': datetime.date(2000, 10, 10)}
            >>> statutory_text = "Bar Matter No. 803, Jan. 1, 2000"
            >>> next(CitableDocument.get_docketed_reports(statutory_text)) # default
            Traceback (most recent call last):
                ...
            StopIteration

        Args:
            text (str): Text to look for `Dockets` and `Reports`
            exclude_docket_rules (bool): When True (the default), matches for
                which `is_statutory_rule` is truthy are skipped. When False,
                every match is yielded.

        Yields:
            Iterator[DocketReport]: Any of custom `Docket` with `Report` types, e.g. `CitationAC`, etc.
        """  # noqa: E501
        text = unicodedata.normalize("NFKD", text)
        for search_func in (
            CitationAC.search,
            CitationAM.search,
            CitationOCA.search,
            CitationBM.search,
            CitationGR.search,
            CitationPET.search,
            CitationUDK.search,
            CitationJIB.search,
        ):
            # Each search function is applied to the text, each match yielded
            for result in search_func(text):
                # Only the statutory-rule filter depends on the flag; the
                # yield must happen either way, otherwise passing
                # `exclude_docket_rules=False` would produce no results.
                if exclude_docket_rules and is_statutory_rule(result):
                    continue
                yield result

    def get_undocketed_reports(self):
        """Steps:

        1. From a set of `uniq_reports` (see `self.reports`);
        2. Compare to reports found in `@docketed_reports`
        3. Limit reports to those _without_ an accompanying docket

        Returns:
            set: The unique reports of `self.text` whose `volpubpage` is not
                carried by any entry of `self.docketed_reports`.
        """
        uniq_reports = set(Report.get_unique(self.text))
        for cite in self.docketed_reports:
            # `discard` is a no-op when the docketed citation has no report.
            uniq_reports.discard(cite.volpubpage)
        return uniq_reports

    def get_citations(self) -> Iterator[str]:
        """There are two main lists to evaluate:

        1. `@docketed_reports` - each includes a `Docket` (optionally attached to a `Report`)
        2. `@reports` - from the same text, just get `Report` objects.

        Can filter out `Report` objects not docketed and thus return
        a more succinct citation list which includes both constructs mentioned above but
        without duplicate `reports`.

        Yields:
            Iterator[str]: When docketed citations exist, their string forms
                followed by the undocketed reports; otherwise the string form
                of every entry in `self.reports`.
        """  # noqa: E501
        if self.docketed_reports:
            for doc_report_cite in self.docketed_reports:
                yield str(doc_report_cite)

            if self.undocketed_reports:
                yield from self.undocketed_reports  # already <str>
        else:
            if self.reports:
                for report in self.reports:
                    yield str(report)

Functions

get_citations()

There are two main lists to evaluate:

  1. @docketed_reports - each includes a Docket (optionally attached to a Report)
  2. @reports - from the same text, just get Report objects.

Can filter out Report objects not docketed and thus return a more succinct citation list which includes both constructs mentioned above but without duplicate reports.

Source code in src/citation_utils/document.py
Python
def get_citations(self) -> Iterator[str]:
    """Yield a de-duplicated list of citation strings for the document.

    Two collections are consulted:

    1. `@docketed_reports` - each includes a `Docket` (optionally attached to a `Report`)
    2. `@reports` - the `Report` objects found in the same text.

    When docketed citations exist they are yielded first (as strings),
    followed by the reports that have no docket. When none exist, every
    entry of `@reports` is yielded as a string instead.
    """  # noqa: E501
    if not self.docketed_reports:
        # No dockets at all: fall back to the plain reports.
        if self.reports:
            yield from map(str, self.reports)
        return

    yield from (str(docketed) for docketed in self.docketed_reports)
    if self.undocketed_reports:
        yield from self.undocketed_reports  # already <str>

get_docketed_reports(text, exclude_docket_rules=True) classmethod

Extract from raw text all raw citations which should include their Docket and Report component parts. This may however include statutory rules since some docket categories like AM and BM use this convention. To exclude statutory rules, a flag is included as a default.

Examples:

Python Console Session
>>> cite = next(CitableDocument.get_docketed_reports("Bagong Alyansang Makabayan v. Zamora, G.R. Nos. 138570, 138572, 138587, 138680, 138698, October 10, 2000, 342 SCRA 449"))
>>> cite.model_dump(exclude_none=True)
{'publisher': 'SCRA', 'volume': '342', 'page': '449', 'context': 'G.R. Nos. 138570, 138572, 138587, 138680, 138698', 'category': 'GR', 'ids': '138570, 138572, 138587, 138680, 138698', 'docket_date': datetime.date(2000, 10, 10)}
>>> statutory_text = "Bar Matter No. 803, Jan. 1, 2000"
>>> next(CitableDocument.get_docketed_reports(statutory_text)) # default
Traceback (most recent call last):
    ...
StopIteration

Parameters:

Name Type Description Default
text str

Text to look for Dockets and Reports

required

Yields:

Type Description
DocketReport

Iterator[DocketReport]: Any of custom Docket with Report types, e.g. CitationAC, etc.

Source code in src/citation_utils/document.py
Python
@classmethod
def get_docketed_reports(
    cls, text: str, exclude_docket_rules: bool = True
) -> Iterator[DocketReport]:
    """Extract from `raw` text all raw citations which should include their `Docket` and `Report` component parts.
    This may however include statutory rules since some docket categories like AM and BM use this convention.
    To exclude statutory rules, a flag is included as a default.

    Examples:
        >>> cite = next(CitableDocument.get_docketed_reports("Bagong Alyansang Makabayan v. Zamora, G.R. Nos. 138570, 138572, 138587, 138680, 138698, October 10, 2000, 342 SCRA 449"))
        >>> cite.model_dump(exclude_none=True)
        {'publisher': 'SCRA', 'volume': '342', 'page': '449', 'context': 'G.R. Nos. 138570, 138572, 138587, 138680, 138698', 'category': 'GR', 'ids': '138570, 138572, 138587, 138680, 138698', 'docket_date': datetime.date(2000, 10, 10)}
        >>> statutory_text = "Bar Matter No. 803, Jan. 1, 2000"
        >>> next(CitableDocument.get_docketed_reports(statutory_text)) # default
        Traceback (most recent call last):
            ...
        StopIteration

    Args:
        text (str): Text to look for `Dockets` and `Reports`
        exclude_docket_rules (bool): When True (the default), matches for
            which `is_statutory_rule` is truthy are skipped. When False,
            every match is yielded.

    Yields:
        Iterator[DocketReport]: Any of custom `Docket` with `Report` types, e.g. `CitationAC`, etc.
    """  # noqa: E501
    text = unicodedata.normalize("NFKD", text)
    for search_func in (
        CitationAC.search,
        CitationAM.search,
        CitationOCA.search,
        CitationBM.search,
        CitationGR.search,
        CitationPET.search,
        CitationUDK.search,
        CitationJIB.search,
    ):
        # Each search function is applied to the text, each match yielded
        for result in search_func(text):
            # Only the statutory-rule filter depends on the flag; the yield
            # must happen either way, otherwise passing
            # `exclude_docket_rules=False` would produce no results.
            if exclude_docket_rules and is_statutory_rule(result):
                continue
            yield result

get_undocketed_reports()

Steps:

  1. From a set of uniq_reports (see self.reports);
  2. Compare to reports found in @docketed_reports
  3. Limit reports to those without an accompanying docket
Source code in src/citation_utils/document.py
Python
def get_undocketed_reports(self):
    """Return the unique reports of the text that have no docket.

    Steps:

    1. Start from the set of unique reports found in `self.text`;
    2. Collect the `volpubpage` of every entry in `@docketed_reports`;
    3. Keep only the reports _without_ an accompanying docket.
    """
    docketed = {cite.volpubpage for cite in self.docketed_reports}
    return set(Report.get_unique(self.text)) - docketed

Docket Model

Bases: BaseModel

The Docket is the modern identifier of a Supreme Court decision. This data structure however is not the final form of the identifier since that description belongs to the Citation and the CountedCitation.

The purpose of this intermediate structure is that a Docket is often paired with a Report, which is the traditional identifier based on volume and page numbers. The pairing, however, is not mandatory; hence the need for a flexible structure that supports the following combinations of the eventual Citation object:

Citation Docket Report
has both docket and report yes yes
only a docket yes no
only a report no yes

See docket_citation.DocketReportCitation to see structure of paired content.

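A Docket is based on a category, a serial id, and a date. Since the raw serial id may contain several ids (e.g. separated by ',' or 'and'), a single identifier is derived from it; see the serial_text and first_id properties.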

Field Type Description
context optional (str) Full text matched by the regex pattern
category optional (DocketCategory) Whether GR, AC, etc.
ids optional (str) The serial number of the docket category
docket_date optional (date) The date associated with the docket
Sample Citation Category Serial Date
G.R. Nos. 138570, October 10, 2000 GR 138570 October 10, 2000
A.M. RTJ-12-2317 (Formerly OCA I.P.I. No. 10-3378-RTJ), Jan 1, 2000 AM RTJ-12-2317 Jan 1, 2000
A.C. No. 10179 (Formerly CBD 11-2985), March 04, 2014 AC 10179 Mar. 4, 2014
Source code in src/citation_utils/dockets/models/docket_model.py
Python
class Docket(BaseModel):
    """
    The `Docket` is the modern identifier of a Supreme Court decision. This data structure
    however is not the final form of the identifier since that description belongs to the `Citation`
    and the `CountedCitation`.

    The purpose of this intermediate structure is that a `Docket` is often paired with a `Report`, which
    is the traditional identifier based on volume and page numbers. The pairing however is not
    mandatory, hence the need for a flexible structure supporting the following combinations of
    the eventual Citation object:

    Citation | Docket | Report
    :--:|:--:|:--:
    has both docket and report | yes | yes
    only a docket | yes | no
    only a report | no | yes

    See docket_citation.DocketReportCitation to see structure of paired content.

    A `Docket` is based on a `category`, a `serial id`, and a `date`. Since the raw serial id
    may contain several ids (e.g. separated by ',' or 'and'), a single identifier is derived
    from it; see `serial_text` and `first_id`.

    Field | Type | Description
    --:|:--:|:--
    `context` | required (str) | Full text matched by the regex pattern
    `category` | required (DocketCategory) | Whether GR, AC, etc.
    `ids` | required (str) | The serial number of the docket category
    `docket_date` | required (date) | The date associated with the docket

    Sample Citation | Category | Serial | Date
    :-- |:--:|:--:|:--:
    _G.R. Nos. 138570, October 10, 2000_ | GR | 138570 | October 10, 2000
    _A.M. RTJ-12-2317 (Formerly OCA I.P.I. No. 10-3378-RTJ), Jan 1, 2000_ | AM | RTJ-12-2317 |Jan 1, 2000
    _A.C. No. 10179 (Formerly CBD 11-2985), March 04, 2014_ | AC | 10179 | Mar. 4, 2014
    """  # noqa: E501

    context: str = Field(..., description="Full text matched by regex pattern.")
    category: DocketCategory = Field(..., description="e.g. General Register, etc.")
    ids: str = Field(
        ..., description="This may be comma-separated, e.g. '12, 32, and 41'"
    )
    docket_date: date = Field(...)

    def __repr__(self) -> str:
        return f"<Docket: {self.category} {self.serial_text}, {self.formatted_date}>"

    def __str__(self) -> str:
        if self.serial_text:
            return (
                f"{self.category} No. {self.serial_text.upper()}, {self.formatted_date}"  # noqa: E501
            )
        return "No proper string detected."

    def __eq__(self, other: object) -> bool:
        """Two dockets are equal when category name, first id and date match.

        Returns `NotImplemented` for operands that are not `Docket` instances
        so that comparisons such as `docket == None` evaluate to False
        instead of raising `AttributeError`.
        """
        if not isinstance(other, Docket):
            return NotImplemented
        opt_1 = is_eq(self.category.name, other.category.name)
        opt_2 = is_eq(self.first_id, other.first_id)
        opt_3 = is_eq(self.docket_date.isoformat(), other.docket_date.isoformat())
        return all([opt_1, opt_2, opt_3])

    @property
    def slug(self):
        """Category name, serial text and ISO date joined by hyphens."""
        return "-".join(
            [self.category.name, self.serial_text, self.docket_date.isoformat()]
        )

    @property
    def serial_text(self) -> str:
        """From raw `ids`, get the `cleaned_ids`, and of these `cleaned_ids`,
            extract the `@first_id` found to deal with compound ids, e.g.
            ids separated by 'and' and ','

        Returns:
            str: Singular text identifier; an empty string when no usable
                identifier remains (which `__str__` treats as "no proper
                string").
        """
        x = self.first_id or self.ids
        if x:
            x = x.rstrip("*•[]")
            bits = x.split()
            if len(bits) > 1:
                # Glue an alphabetic prefix/suffix to its neighbor, adding a
                # hyphen only when the neighbor does not already carry one.
                if bits[0].isalpha():
                    # e.g. 'mtj -02-1466' already has the hyphen
                    joiner = "" if bits[1].startswith("-") else "-"
                    x = f"{bits[0]}{joiner}{bits[1]}"
                elif bits[1].isalpha():
                    # e.g. '14061- ret' already has the hyphen
                    joiner = "" if bits[0].endswith("-") else "-"
                    x = f"{bits[0]}{joiner}{bits[1]}"
            # `gr_prefix_clean` is defined elsewhere; a truthy result replaces
            # the identifier wholesale.
            if adjust := gr_prefix_clean(x):
                return adjust
        # Guard against an empty identifier (e.g. ids consisting only of
        # stripped characters) instead of raising IndexError.
        tokens = x.split()
        return tokens[0] if tokens else ""

    @property
    def first_id(self) -> str:
        """Get first bit from list of separated ids, when possible.

        The separators are tried in a fixed order; the first one that is
        present in `ids` and leaves a non-empty leading part wins.

        Returns:
            str: First id found, or the unmodified `ids` when no separator
                applies.
        """
        for char in [" - ", "/", ",", ";", " and ", " AND ", "&"]:
            if char in self.ids and (res := self.ids.split(char)[0]):
                return res
        return self.ids

    @property
    def formatted_date(self) -> str | None:
        """`docket_date` rendered with `DOCKET_DATE_FORMAT`, if a date is set."""
        if self.docket_date:
            return self.docket_date.strftime(DOCKET_DATE_FORMAT)
        return None

    @classmethod
    def check_serial_num(cls, text: str) -> bool:
        """If a serial number exists, ensure it meets criteria prior to row creation."""
        return bool(DB_SERIAL_NUM.search(text.lower()))

    @classmethod
    def clean_serial(cls, text: str) -> str | None:
        """Criteria:

        1. Must be lowercased
        2. Characters that can be included `a-z`, `0-9`, `-`
        3. Must only contain a single alpha-numeric reference

        Args:
            text (str): Raw text to clean

        Returns:
            str | None: Cleaned serial text fit for database input, or None
                when the text holds no matching serial.
        """
        text = text.lower()
        if " " in text:
            tokens = text.split()
            if not tokens:
                # Space-only input: nothing to clean (previously IndexError).
                return None
            text = tokens[0]
        if match := DB_SERIAL_NUM.search(text):
            if candidate := match.group("serial"):
                return candidate
        return None

Attributes

first_id: str property

Get first bit from list of separated ids, when possible.

Returns:

Name Type Description
str str

First id found

serial_text: str property

From raw ids, get the cleaned_ids, and of these cleaned_ids, extract the @first_id found to deal with compound ids, e.g. ids separated by 'and' and ','

Returns:

Name Type Description
str str

Singular text identifier

Functions

check_serial_num(text) classmethod

If a serial number exists, ensure it meets criteria prior to row creation.

Source code in src/citation_utils/dockets/models/docket_model.py
Python
@classmethod
def check_serial_num(cls, text: str) -> bool:
    """Tell whether the lowercased `text` matches `DB_SERIAL_NUM`.

    Meant as a guard before row creation: True when a serial number meeting
    the pattern's criteria is found, False otherwise.
    """
    found = DB_SERIAL_NUM.search(text.lower())
    return found is not None

clean_serial(text) classmethod

Criteria:

  1. Must be lowercased
  2. Characters that can be included a-z, 0-9, -
  3. Must only contain a single alpha-numeric reference

Parameters:

Name Type Description Default
text str

Raw text to clean

required

Returns:

Name Type Description
str str | None

Cleaned serial text fit for database input.

Source code in src/citation_utils/dockets/models/docket_model.py
Python
@classmethod
def clean_serial(cls, text: str) -> str | None:
    """Criteria:

    1. Must be lowercased
    2. Characters that can be included `a-z`, `0-9`, `-`
    3. Must only contain a single alpha-numeric reference

    Args:
        text (str): Raw text to clean

    Returns:
        str | None: Cleaned serial text fit for database input, or None when
            the text holds no matching serial.
    """
    text = text.lower()
    if " " in text:
        tokens = text.split()
        if not tokens:
            # Space-only input: nothing to clean (previously IndexError).
            return None
        # Only the first whitespace-separated token is considered.
        text = tokens[0]
    if match := DB_SERIAL_NUM.search(text):
        if candidate := match.group("serial"):
            return candidate
    return None

Docket Category

Docket Category Model

Bases: StrEnum

Common docket references involving Philippine Supreme Court decisions.

Name Value
GR General Register
AM Administrative Matter
AC Administrative Case
BM Bar Matter
PET Presidential Electoral Tribunal
OCA Office of the Court Administrator
JIB Judicial Integrity Board
UDK Undocketed

Complications

Legacy rules

These categories do not always represent decisions. For instance, there are AM and BM docket numbers that represent rules rather than decisions.

Redocketed numbers

From the Supreme Court Stylebook (p. 159, 2024):

11.3.1. Redocketed numbers

Some cases may have an undocketed (UDK) number and may be redocketed and assigned a General Register (G.R.) number upon payment of the required docket fees. Still other cases may have a docket number starting with OCA IPI or JIB and may be redocketed as Administrative Matters (A.M.), while Commission on Bar Discipline (CBD) cases may be redocketed as Administrative Cases (A.C.). These must still be reflected in all court resolutions, orders, and decisions. x x x

Source code in src/citation_utils/dockets/models/docket_category.py
Python
class DocketCategory(StrEnum):
    """Common docket references involving Philippine Supreme Court decisions.

    Name | Value
    :--|:--
    `GR` | General Register
    `AM` | Administrative Matter
    `AC` | Administrative Case
    `BM` | Bar Matter
    `PET` | Presidential Electoral Tribunal
    `OCA` | Office of the Court Administrator
    `JIB` | Judicial Integrity Board
    `UDK` | Undocketed

    ## Complications

    ### Legacy rules

    These categories do not always represent decisions. For instance,
    there are are `AM` and `BM` docket numbers that represent rules rather
    than decisions.

    ### Redocketed numbers

    From the Supreme Court Stylebook (p. 159, 2024):

    > 11.3.1. Redocketed numbers
    >
    > Some cases may have an undocketed (UDK) number and may be redocketed and assigned a
    General Register (G.R.) number upon payment of the required docket fees. Still other cases may have
    a docket number starting with OCA IPI or JIB and may be redocketed as Administrative Matters (A.M.),
    while Commission on Bar Discipline (CBD) cases may be redocketed as Administrative Cases (A.C.).
    These must still be reflected in all court resolutions, orders, and decisions. x x x
    """

    GR = "General Register"
    AM = "Administrative Matter"
    AC = "Administrative Case"
    BM = "Bar Matter"
    PET = "Presidential Electoral Tribunal"
    OCA = "Office of the Court Administrator"
    JIB = "Judicial Integrity Board"
    UDK = "Undocketed"

    def __str__(self):
        return self.name

    def __repr__(self) -> str:
        """Uses name of member `gr` instead of Enum default
        `<DocketCategory.GR: 'General Register'>`. It becomes to
        use the following conventions:

        Examples:
            >>> DocketCategory['GR']
            'GR'
            >>> DocketCategory.GR
            'GR'

        Returns:
            str: The value of the Enum name
        """
        return str.__repr__(self.name.upper())

Functions

__repr__()

Uses the name of the member (e.g. GR) instead of the Enum default <DocketCategory.GR: 'General Register'>. This makes the following conventions possible:

Examples:

Python Console Session
>>> DocketCategory['GR']
'GR'
>>> DocketCategory.GR
'GR'

Returns:

Name Type Description
str str

The value of the Enum name

Source code in src/citation_utils/dockets/models/docket_category.py
Python
def __repr__(self) -> str:
    """Uses name of member `gr` instead of Enum default
    `<DocketCategory.GR: 'General Register'>`. It becomes to
    use the following conventions:

    Examples:
        >>> DocketCategory['GR']
        'GR'
        >>> DocketCategory.GR
        'GR'

    Returns:
        str: The value of the Enum name
    """
    return str.__repr__(self.name.upper())

Docket CitationConstructor

Although the different category docket models share a similar configuration, the regex strings involved are different for each, prompting the need for a preparatory constructor class:

Bases: BaseModel

Prefatorily, regex strings are defined so that a re.Pattern object can take advantage of the "group_name" assigned in the string.

These are the docket styles with regex strings predefined:

  1. General Register
  2. Administrative Matter
  3. Administrative Case
  4. Bar Matter
  5. Office of the Court Administrator
  6. Presidential Electoral Tribunal
  7. Judicial Integrity Board
  8. Undocketed Case

The CitationConstructor formalizes the assigned group names into their respective fields.

Relatedly, it takes advantage of the citation_date and the citation_report libraries in generating the main @pattern since the regex strings above are only concerned with the key num id formula part of the docket, e.g. GR No. 123... but not the accompanying date and report.

Source code in src/citation_utils/dockets/models/constructor.py
Python
class CitationConstructor(BaseModel):
    """Prefatorily, regex strings are defined so that a
    `re.Pattern` object can take advantage of the "group_name"
    assigned in the string.

    These are the docket styles with regex strings predefined:

    1. General Register
    2. Administrative Matter
    3. Administrative Case
    4. Bar Matter
    5. Office of the Court Administrator
    6. Presidential Electoral Tribunal
    7. Judicial Integrity Board
    8. Undocketed Case

    The CitationConstructor formalizes the assigned group names into
    their respective fields.

    Relatedly, it takes advantage of
    the `citation_date` and the `citation_report` libraries in
    generating the main `@pattern` since the regex strings above
    are only concerned with the `key` `num` `id` formula part
    of the docket, e.g. `GR` `No.` `123`... but not the accompanying
    date and report.
    """

    label: str = Field(
        ...,
        title="Docket Label",
        description="e.g. General Register, Administrative Matter",
    )
    short_category: str = Field(
        ..., title="Docket Category Shorthand", description="e.g. GR, AM"
    )
    group_name: str = Field(
        ...,
        title="Regex Group Name",
        # The trailing space matters: the two literals are concatenated, and
        # without it the description read "...portion of theMatch object...".
        description=(
            "e.g. 'gr_test_phrase' identifies that portion of the "
            "Match object that should be associated with the label."
        ),
    )
    init_name: str = Field(
        ...,
        title="Regex Initial Group Name",
        description="e.g. gr_mid, am_init; see .regexes for other group names",
    )
    docket_regex: str = Field(
        ...,
        title="Regex Expression Proper",
        description=(
            "The full regex expression which includes the groupnames referred to above."
        ),
    )
    key_regex: str = Field(
        ...,
        title="Regex Key",
        description="Regex portion to get the serial ids",
    )
    num_regex: str = Field(
        ...,
        title="Regex Num",
        description="Regex portion for the num keyword to get the serial ids",
    )

    @property
    def pattern(self) -> re.Pattern:
        """Construct the regex string and generate a full Pattern object from:

        1. `docket_regex`,
        2. `docket_date` defined in the citation-date library
        3. an optional `REPORT_REGEX` defined in the citation-report library

        The pattern is compiled case-insensitively and in verbose mode.
        `formerly` and `pp` are regex fragments defined outside this class;
        both are optional parts of the `extra_phrase` group.

        Returns:
            Pattern: Combination of Docket and Report styles.
        """
        return re.compile(
            "".join(
                [
                    rf"{self.docket_regex}",
                    rf"(?P<extra_phrase>{formerly}?{pp}?){DOCKET_DATE_REGEX}",
                    rf"(?P<opt_report>\,\s*{REPORT_REGEX})?",
                ]
            ),
            re.I | re.X,
        )

    @property
    def key_num_pattern(self) -> re.Pattern:
        """Unlike full @pattern, this regex compiled object is limited to
        just the key and number elements, e.g. "GR No. 123" or "BP Blg. 45"
        """
        regex = rf"{self.key_regex}({self.num_regex})?"
        return re.compile(regex, re.I | re.X)

    def detect(self, raw: str) -> Iterator[dict[str, Any]]:
        """Logic: if `self.init_name` Match group exists, get entire
        regex based on `self.group_name`, extract subgroups which will
        consist of `Docket` and `Report` parts.

        A match is only yielded when both a non-empty serial id and a
        decodable date are found.

        Args:
            raw (str): Text to evaluate

        Yields:
            Iterator[dict[str, Any]]: A dict that can fill up a Docket + Report pydantic BaseModel
        """  # noqa: E501
        for match in self.pattern.finditer(raw):
            if match.group(self.init_name):
                if ctx := match.group(self.group_name).strip(", "):
                    # Remove the key/num prefix so only the serial ids remain.
                    raw_id = cull_extra(self.key_num_pattern.sub("", ctx))
                    ids = raw_id.strip("()[] .,;")
                    raw_date = match.group("docket_date")
                    date_found = decode_date(raw_date, True)
                    if ids and date_found:
                        yield dict(
                            context=ctx,
                            short_category=self.short_category,
                            category=self.label,
                            ids=ids,
                            docket_date=date_found,
                            publisher=get_publisher_label(match),
                            volpubpage=match.group("volpubpage"),
                            volume=match.group("volume"),
                            page=match.group("page"),
                        )

Attributes

key_num_pattern: re.Pattern property

Unlike full @pattern, this regex compiled object is limited to just the key and number elements, e.g. "GR No. 123" or "BP Blg. 45"

pattern: re.Pattern property

Construct the regex string and generate a full Pattern object from:

  1. docket_regex,
  2. docket_date defined in the citation-date library
  3. an optional REPORT_REGEX defined in the citation-report library

Returns:

Name Type Description
Pattern Pattern

Combination of Docket and Report styles.

Functions

detect(raw)

Logic: if self.init_name Match group exists, get entire regex based on self.group_name, extract subgroups which will consist of Docket and Report parts.

Parameters:

Name Type Description Default
raw str

Text to evaluate

required

Yields:

Type Description
dict[str, Any]

Iterator[dict[str, Any]]: A dict that can fill up a Docket + Report pydantic BaseModel

Source code in src/citation_utils/dockets/models/constructor.py
Python
def detect(self, raw: str) -> Iterator[dict[str, Any]]:
    """Scan `raw` with `self.pattern` and yield one dict per usable match.

    A match is considered only when its `self.init_name` group is present.
    The text of the `self.group_name` group then supplies the context, from
    which the serial ids are derived; the `docket_date` group supplies the
    date. Matches lacking either ids or a decodable date are skipped.

    Args:
        raw (str): Text to evaluate

    Yields:
        Iterator[dict[str, Any]]: A dict that can fill up a Docket + Report pydantic BaseModel
    """  # noqa: E501
    for found in self.pattern.finditer(raw):
        if not found.group(self.init_name):
            continue

        context = found.group(self.group_name).strip(", ")
        if not context:
            continue

        # Drop the key/num prefix, then trim brackets and punctuation.
        without_key = self.key_num_pattern.sub("", context)
        serials = cull_extra(without_key).strip("()[] .,;")
        parsed_date = decode_date(found.group("docket_date"), True)
        if not (serials and parsed_date):
            continue

        yield {
            "context": context,
            "short_category": self.short_category,
            "category": self.label,
            "ids": serials,
            "docket_date": parsed_date,
            "publisher": get_publisher_label(found),
            "volpubpage": found.group("volpubpage"),
            "volume": found.group("volume"),
            "page": found.group("page"),
        }