pagemeta docs

InspectedURL

InspectedURL is a dataclass that fetches a URL and extracts its relevant metadata via SiteHeaders (response headers) and PageMeta (HTML content).

Source code in pagemeta/main.py

Python
@dataclass
class InspectedURL:
    """Fetch a URL and collect metadata from its response.

    Creating an instance performs an HTTP GET on `url` (following
    redirects) inside `__post_init__`. The response headers are turned
    into a `SiteHeaders` and the response body into a `PageMeta`. Any
    `headers` / `meta` values passed to the constructor are overwritten
    by the fetched ones.
    """

    url: str
    headers: SiteHeaders | None = None
    meta: PageMeta | None = None

    def __post_init__(self):
        """Request `self.url` and populate `headers` and `meta`."""
        response = httpx.get(self.url, follow_redirects=True)
        self.headers = SiteHeaders.from_raw_headers(data=response.headers)
        soup = BeautifulSoup(response.content, "html.parser")
        self.meta = PageMeta.from_soup(soup)

    def prep(self, obj) -> dict:
        """Return `obj` (a dataclass instance) as a dict, or `{}` for `None`."""
        if obj is None:
            return {}
        return asdict(obj)

    @property
    def url_data(self) -> dict:
        """The components of `self.url`, keyed by `urlparse` field name."""
        # urlparse returns a named tuple; _asdict() maps each of its
        # _fields to the corresponding component.
        return dict(urlparse(self.url)._asdict())

    @property
    def export(self):
        """Merge URL parts, header fields and page metadata into one dict.

        Later sources win on key collisions: headers override URL parts,
        and page metadata overrides both.
        """
        combined = dict(self.url_data)
        combined |= self.prep(self.headers)
        combined |= self.prep(self.meta)
        return combined

SiteHeaders

SiteHeaders is a dataclass that extracts relevant metadata (content-type, etag, last-modified, date) from the headers of an httpx.Response.

Source code in pagemeta/headers.py

Python
@dataclass
class SiteHeaders:
    """Selected HTTP response headers of a fetched page.

    A server is not required to send any of these headers (`ETag` and
    `Last-Modified` in particular are often absent), so each field is
    `None` when the corresponding header is missing from the response.

    Field | Type | Description
    :--:|:--:|:--
    content_type | str | Value of the `content-type` header
    etag | str | Value of the `etag` header
    last_modified | datetime | Parsed value of the `last-modified` header
    date | datetime | Parsed value of the `date` header
    """

    content_type: str | None
    etag: str | None
    last_modified: datetime.datetime | None
    date: datetime.datetime | None

    @classmethod
    def from_raw_headers(cls, data: httpx.Headers) -> Self:
        """Build a `SiteHeaders` from the headers of a response.

        Args:
            data (httpx.Headers): Response headers, e.g. `response.headers`.

        Returns:
            Self: Instance whose fields are `None` for headers that are absent.
        """
        # Use .get() rather than indexing: a missing optional header must
        # not raise KeyError and abort the whole inspection.
        raw_last_modified = data.get("last-modified")
        raw_date = data.get("date")
        return cls(
            content_type=data.get("content-type"),
            etag=data.get("etag"),
            # Only hand non-empty values to the date parser.
            last_modified=parse(raw_last_modified) if raw_last_modified else None,
            date=parse(raw_date) if raw_date else None,
        )

PageMeta

PageMeta is a dataclass that extracts a page's metadata (title, description, open graph image) from its HTML, parsed with BeautifulSoup.

Extract generic website metadata from the parsed HTML of a page.

All of the fields default to None.

Field	Type	Description
title	str	First matching title parsed from `<meta>` CSS selectors (and the `<title>` tag)
description	str	First matching description parsed from `<meta>` CSS selectors
author	str	Either the author or the creator, if the author is absent
image	str	An open graph (OG) image url detected
category	str	A type detected from OG ("og:type") values

Source code in pagemeta/meta.py

Python
@dataclass
class PageMeta:
    """Extract generic website metadata from the parsed HTML of a page.

    All of the fields default to `None`.

    Field | Type | Description
    :--:|:--:|:--
    title | str | First matching title parsed from `<meta>` CSS selectors (and the `<title>` tag)
    description | str | First matching description parsed from `<meta>` CSS selectors
    author | str | Either the author or the creator, if the author is absent
    image | str | An [open graph](https://ogp.me/) (OG) image url detected
    category | str | A type detected from OG ("og:type") values
    """  # noqa: E501

    title: str | None = None
    description: str | None = None
    author: str | None = None
    image: str | None = None
    category: str | None = None

    @classmethod
    def from_soup(cls, soup: BeautifulSoup):
        """Build a `PageMeta` by running each field's selectors against `soup`."""
        selectors_by_field = {
            "title": TITLE,
            "description": DESC,
            "author": AUTHOR,
            "image": IMG,
            "category": TYPE,
        }
        return cls(
            **{
                field: cls.select(soup, candidates)
                for field, candidates in selectors_by_field.items()
            }
        )

    @classmethod
    def select(
        cls, soup: BeautifulSoup, selectors: Iterable[str]
    ) -> str | None:
        """Try the CSS selectors in order and return the first value found.

        For a `meta...` selector the value is the `content` attribute of
        the first matching element; empty or non-string values are skipped.

        These are the selector lists currently used to extract content:

        ```py
        TITLE = (
            'meta[name="twitter:title"]',
            'meta[property="og:title"]',
            "title",
        )
        DESC = (
            'meta[name="twitter:description"]',
            'meta[property="og:description"]',
            'meta[name="description"]',
        )
        IMG = (
            'meta[name="twitter:image"]',
            'meta[property="og:image"]',
        )
        AUTHOR = (
            'meta[name="author"]',
            'meta[name="twitter:creator"]',
        )
        TYPE = ('meta[property="og:type"]',)
        ```

        Note the special rule for `title` as a selector: it yields the
        text of the first `<title>` tag rather than a `content` attribute.

        Examples:
            >>> from pathlib import Path
            >>> html = Path(__file__).parent.parent / "tests" / "data" / "test.html"
            >>> soup = BeautifulSoup(html.read_text(), "html.parser")
            >>> PageMeta.select(soup, TITLE)
            'Hello World From Twitter Title!'
            >>> PageMeta.select(soup, DESC)
            'this is a description from twitter:desc'

        Args:
            soup (BeautifulSoup): Converted html content into a soup object
            selectors (Iterable[str]): CSS selectors as a tuple

        Returns:
            str | None: The first value found, otherwise `None`.
        """
        for css in selectors:
            if css == "title":
                title_tags = soup("title")
                if title_tags:
                    return title_tags[0].get_text()
                continue

            # Any selector that is neither "title" nor meta-based is ignored.
            if not css.startswith("meta"):
                continue

            matches = soup.select(css)
            if not matches:
                continue

            value = matches[0].get("content")
            # Skip missing, empty or non-string (multi-valued) attributes.
            if isinstance(value, str) and value:
                return value

        return None

Functions

`select(soup, selectors)` `classmethod`

The CSS selectors are tried in order. The first one that matches supplies the content, if any is found.

See present list of selectors used to extract content:

Python

# Each tuple is ordered by priority: `select` tries the selectors from
# top to bottom and returns the first value it finds.
TITLE = (
    'meta[name="twitter:title"]',
    'meta[property="og:title"]',
    "title",  # special case: text of the <title> tag, not a `content` attribute
)
DESC = (
    'meta[name="twitter:description"]',
    'meta[property="og:description"]',
    'meta[name="description"]',
)
IMG = (
    'meta[name="twitter:image"]',
    'meta[property="og:image"]',
)
AUTHOR = (
    'meta[name="author"]',
    'meta[name="twitter:creator"]',
)
TYPE = ('meta[property="og:type"]',)

Note the special rule on title as a selector.

Examples:

Python Console Session

>>> from pathlib import Path
>>> html = Path(__file__).parent.parent / "tests" / "data" / "test.html"
>>> soup = BeautifulSoup(html.read_text(), "html.parser")
>>> PageMeta.select(soup, TITLE)
'Hello World From Twitter Title!'
>>> PageMeta.select(soup, DESC)
'this is a description from twitter:desc'

Parameters:

Name	Type	Description	Default
`soup`	`BeautifulSoup`	Converted html content into a soup object	required
`selectors`	`Iterable[str]`	CSS selectors as a tuple	required

Returns:

Type	Description
`str \| None`	The matched text value if found, otherwise `None`.