Source code for pyutils.url

"""URL utility functions.

This module provides utility functions for working with URLs,
porting JavaScript URL object methods and other URL utilities to Python.
"""

import urllib.parse



[docs]
class URLParser:
    """URL parser class similar to JavaScript URL object.

    Provides methods to parse and manipulate URLs similar to the JavaScript URL API.
    """


[docs]
    def __init__(self, url: str, base: str | None = None):
        """Initialize URL parser.

        Args:
            url: URL string to parse
            base: Base URL for relative URLs

        Examples:
            >>> parser = URLParser('https://example.com/path?query=value#hash')
            >>> parser.hostname
            'example.com'
        """
        if not url:
            raise ValueError("Invalid URL: empty string")

        if base and not urllib.parse.urlparse(url).scheme:
            url = urllib.parse.urljoin(base, url)

        self._parsed = urllib.parse.urlparse(url)

        # Validate that we have at least a scheme
        if not self._parsed.scheme:
            raise ValueError("Invalid URL: missing scheme")

        self._query_params = urllib.parse.parse_qs(self._parsed.query)


    @property
    def href(self) -> str:
        """Get the complete URL.

        Returns:
            Complete URL string
        """
        return self._parsed.geturl()

    @property
    def protocol(self) -> str:
        """Get the protocol (scheme) of the URL.

        Returns:
            Protocol string (e.g., 'https:')
        """
        return f"{self._parsed.scheme}:" if self._parsed.scheme else ""

    @property
    def hostname(self) -> str:
        """Get the hostname of the URL.

        Returns:
            Hostname string
        """
        return self._parsed.hostname or ""

    @property
    def port(self) -> str:
        """Get the port of the URL.

        Returns:
            Port string (empty if default port)
        """
        return str(self._parsed.port) if self._parsed.port else ""

    @property
    def pathname(self) -> str:
        """Get the pathname of the URL.

        Returns:
            Pathname string (defaults to '/' if empty)
        """
        return self._parsed.path or "/"

    @property
    def search(self) -> str:
        """Get the search (query) string of the URL.

        Returns:
            Search string including '?' prefix
        """
        return f"?{self._parsed.query}" if self._parsed.query else ""

    @property
    def hash(self) -> str:
        """Get the hash (fragment) of the URL.

        Returns:
            Hash string including '#' prefix
        """
        return f"#{self._parsed.fragment}" if self._parsed.fragment else ""

    @property
    def origin(self) -> str:
        """Get the origin of the URL.

        Returns:
            Origin string (protocol + hostname + port)
        """
        if not self._parsed.scheme or not self._parsed.hostname:
            return ""
        origin = f"{self._parsed.scheme}://{self._parsed.hostname}"
        if self._parsed.port:
            origin += f":{self._parsed.port}"
        return origin

    @property
    def username(self) -> str:
        """Get the username from the URL.

        Returns:
            Username string
        """
        return self._parsed.username or ""

    @property
    def password(self) -> str:
        """Get the password from the URL.

        Returns:
            Password string
        """
        return self._parsed.password or ""


[docs]
    def get_query_params(self) -> dict[str, str]:
        """Get query parameters as a dictionary.

        Returns:
            Dictionary of query parameters

        Examples:
            >>> parser = URLParser('https://example.com?name=John&age=30')
            >>> params = parser.get_query_params()
            >>> params['name']
            'John'
        """
        if not self._parsed.query:
            return {}
        return dict(urllib.parse.parse_qsl(self._parsed.query, keep_blank_values=True))





[docs]
def parse_url(url: str, base: str | None = None) -> dict[str, str]:
    """Split a URL into its JavaScript-style components.

    Args:
        url: URL string to parse
        base: Base URL used to resolve ``url`` when it is relative

    Returns:
        Dictionary keyed by component name: ``href``, ``protocol``,
        ``hostname``, ``port``, ``pathname``, ``search``, ``hash``,
        ``origin``, ``username`` and ``password``

    Raises:
        ValueError: Propagated from ``URLParser`` when the URL is rejected

    Examples:
        >>> result = parse_url('https://example.com/path')
        >>> result['hostname']
        'example.com'
    """
    # Each key doubles as the name of the URLParser property it is read from.
    component_names = (
        "href",
        "protocol",
        "hostname",
        "port",
        "pathname",
        "search",
        "hash",
        "origin",
        "username",
        "password",
    )
    parsed = URLParser(url, base)
    return {name: getattr(parsed, name) for name in component_names}




[docs]
def encode_uri_component(text: str) -> str:
    """Percent-encode a string so it can be embedded in a URI component.

    Modelled on JavaScript ``encodeURIComponent``. Only ASCII letters,
    digits and ``_.-~`` are left untouched; everything else, including
    ``/`` and ``!*'()``, is percent-encoded as UTF-8.

    Args:
        text: String to encode

    Returns:
        Encoded string

    Examples:
        >>> encode_uri_component('hello world')
        'hello%20world'
        >>> encode_uri_component('user@example.com')
        'user%40example.com'
    """
    # quote() treats "/" as safe by default; an empty set disables that.
    no_extra_safe_chars = ""
    return urllib.parse.quote(text, safe=no_extra_safe_chars)




[docs]
def decode_uri_component(text: str) -> str:
    """Decode a URI component string (like JavaScript decodeURIComponent).

    Args:
        text: String to decode

    Returns:
        Decoded string

    Raises:
        ValueError: If the string contains invalid percent encoding, i.e. a
            '%' that is not followed by two hex digits (including a '%' or
            '%X' dangling at the end of the string), or escapes that do not
            form valid UTF-8

    Examples:
        >>> decode_uri_component('hello%20world')
        'hello world'
        >>> decode_uri_component('user%40example.com')
        'user@example.com'
    """
    import re

    # Every "%" must introduce exactly two hex digits. A negative lookahead
    # is used rather than matching the offending character so that a
    # truncated escape at the very end of the string is rejected too.
    if re.search(r"%(?![0-9A-Fa-f]{2})", text):
        raise ValueError("Invalid percent encoding")

    try:
        # errors="strict" turns malformed UTF-8 byte sequences into an
        # exception instead of silently substituting U+FFFD.
        return urllib.parse.unquote(text, errors="strict")
    except UnicodeDecodeError as e:
        raise ValueError(f"Invalid percent encoding: {e}") from e




[docs]
def encode_uri(uri: str) -> str:
    """Percent-encode a whole URI while keeping its delimiters intact.

    Modelled on JavaScript ``encodeURI``. The RFC 3986 reserved characters
    are passed through unchanged so the URI keeps its structure; other
    characters outside the unreserved set (spaces, ``%``, non-ASCII text,
    ...) are percent-encoded as UTF-8.

    Args:
        uri: URI to encode

    Returns:
        Encoded URI

    Examples:
        >>> encode_uri('https://example.com/path with spaces')
        'https://example.com/path%20with%20spaces'
    """
    # RFC 3986 "reserved" = gen-delims + sub-delims.
    gen_delims = ":/?#[]@"
    sub_delims = "!$&'()*+,;="
    return urllib.parse.quote(uri, safe=gen_delims + sub_delims)




[docs]
def decode_uri(uri: str) -> str:
    """Decode a complete URI (like JavaScript decodeURI).

    Percent-escapes are decoded as UTF-8, except escapes of the URI
    delimiters ``; / ? : @ & = + $ , #``. Those are left in their escaped
    form, because turning e.g. ``%2F`` or ``%26`` into a literal ``/`` or
    ``&`` would change how the URI splits into path segments and query
    parameters. Use ``decode_uri_component`` to decode a single component
    completely.

    Malformed escapes are left as-is and invalid UTF-8 is replaced rather
    than raising.

    Args:
        uri: URI to decode

    Returns:
        Decoded URI

    Examples:
        >>> decode_uri('https://example.com/path%20with%20spaces')
        'https://example.com/path with spaces'
        >>> decode_uri('https://example.com/a%2Fb')
        'https://example.com/a%2Fb'
    """
    import re

    # Characters whose escapes JavaScript's decodeURI never decodes.
    reserved = ";/?:@&=+$,#"

    def _protect(match: "re.Match[str]") -> str:
        # Escape the "%" itself so unquote() turns "%252F" back into the
        # literal text "%2F", preserving the original hex digit case.
        hex_digits = match.group(1)
        if chr(int(hex_digits, 16)) in reserved:
            return "%25" + hex_digits
        return match.group(0)

    # re.sub scans left to right without overlapping, so an existing "%25"
    # is consumed as one escape and its trailing digits are never misread.
    protected = re.sub(r"%([0-9A-Fa-f]{2})", _protect, uri)
    return urllib.parse.unquote(protected)




[docs]
def build_url(
    protocol: str = "http",
    hostname: str = "",
    port: int | None = None,
    pathname: str = "",
    query_params: dict[str, str] | None = None,
    hash_fragment: str = "",
) -> str:
    """Build a URL from components.

    Args:
        protocol: Protocol (without colon)
        hostname: Hostname
        port: Port number
        pathname: Path
        query_params: Query parameters
        hash_fragment: Hash fragment

    Returns:
        Complete URL string

    Examples:
        >>> build_url(
        ...     hostname='example.com',
        ...     pathname='/api/users',
        ...     query_params={'page': '1'}
        ... )
        'http://example.com/api/users?page=1'
    """
    if not hostname:
        raise ValueError("hostname is required")

    # Build base URL
    url = f"{protocol}://{hostname}"

    # Add port if specified and not default
    if port and not (
        (protocol == "http" and port == 80) or (protocol == "https" and port == 443)
    ):
        url += f":{port}"

    # Add pathname
    if pathname:
        if not pathname.startswith("/"):
            pathname = "/" + pathname
        url += pathname

    # Add query parameters
    if query_params:
        query_string = urllib.parse.urlencode(query_params)
        url += "?" + query_string

    # Add hash fragment
    if hash_fragment:
        if not hash_fragment.startswith("#"):
            hash_fragment = "#" + hash_fragment
        url += hash_fragment

    return url




[docs]
def is_valid_url(url: str) -> bool:
    """Report whether a string looks like an absolute URL.

    A string is accepted when ``urllib.parse.urlparse`` finds both a scheme
    and a network location in it. Any error raised while parsing is treated
    as "not valid".

    Args:
        url: String to check

    Returns:
        True if valid URL, False otherwise

    Examples:
        >>> is_valid_url('https://example.com')
        True
        >>> is_valid_url('not-a-url')
        False
        >>> is_valid_url('ftp://files.example.com')
        True
    """
    try:
        parts = urllib.parse.urlparse(url)
    except Exception:
        return False
    else:
        return bool(parts.scheme and parts.netloc)




[docs]
def get_domain(url: str) -> str | None:
    """Extract domain from URL.

    Args:
        url: URL string

    Returns:
        Domain string or None if invalid

    Examples:
        >>> get_domain('https://www.example.com/path')
        'www.example.com'
        >>> get_domain('http://subdomain.example.org:8080/api')
        'subdomain.example.org'
        >>> get_domain('invalid-url')
        None
    """
    try:
        parsed = urllib.parse.urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            return None
        return parsed.hostname
    except Exception:
        return None




[docs]
def get_query_params(url: str) -> dict[str, str]:
    """Read the query string of a URL into a dictionary.

    Blank values are kept. When a key appears more than once, the last
    occurrence wins. Any error raised while parsing yields an empty
    dictionary.

    Args:
        url: URL string

    Returns:
        Dictionary of query parameters

    Examples:
        >>> get_query_params(
        ...     'https://example.com/path?foo=bar&baz=qux'
        ... )
        {'foo': 'bar', 'baz': 'qux'}
        >>> get_query_params('https://example.com/path')
        {}
    """
    try:
        query = urllib.parse.urlparse(url).query
        if query:
            pairs = urllib.parse.parse_qsl(query, keep_blank_values=True)
            return dict(pairs)
        return {}
    except Exception:
        return {}