Source code for pyutils.url

"""URL utility functions.

This module provides utility functions for working with URLs,
porting JavaScript URL object methods and other URL utilities to Python.
"""

import urllib.parse


[docs] class URLParser: """URL parser class similar to JavaScript URL object. Provides methods to parse and manipulate URLs similar to the JavaScript URL API. """
[docs] def __init__(self, url: str, base: str | None = None): """Initialize URL parser. Args: url: URL string to parse base: Base URL for relative URLs Examples: >>> parser = URLParser('https://example.com/path?query=value#hash') >>> parser.hostname 'example.com' """ if not url: raise ValueError("Invalid URL: empty string") if base and not urllib.parse.urlparse(url).scheme: url = urllib.parse.urljoin(base, url) self._parsed = urllib.parse.urlparse(url) # Validate that we have at least a scheme if not self._parsed.scheme: raise ValueError("Invalid URL: missing scheme") self._query_params = urllib.parse.parse_qs(self._parsed.query)
@property def href(self) -> str: """Get the complete URL. Returns: Complete URL string """ return self._parsed.geturl() @property def protocol(self) -> str: """Get the protocol (scheme) of the URL. Returns: Protocol string (e.g., 'https:') """ return f"{self._parsed.scheme}:" if self._parsed.scheme else "" @property def hostname(self) -> str: """Get the hostname of the URL. Returns: Hostname string """ return self._parsed.hostname or "" @property def port(self) -> str: """Get the port of the URL. Returns: Port string (empty if default port) """ return str(self._parsed.port) if self._parsed.port else "" @property def pathname(self) -> str: """Get the pathname of the URL. Returns: Pathname string (defaults to '/' if empty) """ return self._parsed.path or "/" @property def search(self) -> str: """Get the search (query) string of the URL. Returns: Search string including '?' prefix """ return f"?{self._parsed.query}" if self._parsed.query else "" @property def hash(self) -> str: """Get the hash (fragment) of the URL. Returns: Hash string including '#' prefix """ return f"#{self._parsed.fragment}" if self._parsed.fragment else "" @property def origin(self) -> str: """Get the origin of the URL. Returns: Origin string (protocol + hostname + port) """ if not self._parsed.scheme or not self._parsed.hostname: return "" origin = f"{self._parsed.scheme}://{self._parsed.hostname}" if self._parsed.port: origin += f":{self._parsed.port}" return origin @property def username(self) -> str: """Get the username from the URL. Returns: Username string """ return self._parsed.username or "" @property def password(self) -> str: """Get the password from the URL. Returns: Password string """ return self._parsed.password or ""
[docs] def get_query_params(self) -> dict[str, str]: """Get query parameters as a dictionary. Returns: Dictionary of query parameters Examples: >>> parser = URLParser('https://example.com?name=John&age=30') >>> params = parser.get_query_params() >>> params['name'] 'John' """ if not self._parsed.query: return {} return dict(urllib.parse.parse_qsl(self._parsed.query, keep_blank_values=True))
def parse_url(url: str, base: str | None = None) -> dict[str, str]:
    """Parse a URL string into a dictionary of its components.

    Args:
        url: URL string to parse
        base: Base URL for relative URLs

    Returns:
        Dictionary keyed by component name ('href', 'protocol',
        'hostname', 'port', 'pathname', 'search', 'hash', 'origin',
        'username', 'password'), each value read from the
        same-named ``URLParser`` property

    Examples:
        >>> result = parse_url('https://example.com/path')
        >>> result['hostname']
        'example.com'
    """
    component_names = (
        "href",
        "protocol",
        "hostname",
        "port",
        "pathname",
        "search",
        "hash",
        "origin",
        "username",
        "password",
    )
    parsed = URLParser(url, base)
    return {name: getattr(parsed, name) for name in component_names}
def encode_uri_component(text: str) -> str:
    """Percent-encode a string for use as a single URI component.

    Modelled on JavaScript's ``encodeURIComponent``.

    Args:
        text: String to encode

    Returns:
        Encoded string

    Examples:
        >>> encode_uri_component('hello world')
        'hello%20world'
        >>> encode_uri_component('user@example.com')
        'user%40example.com'
    """
    # Passing an empty ``safe`` set means "/" is escaped as well.
    nothing_safe = ""
    encoded = urllib.parse.quote(text, safe=nothing_safe)
    return encoded
def decode_uri_component(text: str) -> str:
    """Decode a URI component string (like JavaScript decodeURIComponent).

    Args:
        text: String to decode

    Returns:
        Decoded string

    Raises:
        ValueError: If the string contains invalid percent encoding: a
            '%' that is not followed by two hexadecimal digits
            (including one cut off by the end of the string), or
            escapes that do not decode to valid UTF-8

    Examples:
        >>> decode_uri_component('hello%20world')
        'hello world'
        >>> decode_uri_component('user%40example.com')
        'user@example.com'
    """
    import re

    # A '%' is valid only when exactly two hex digits follow it. The
    # negative lookahead also rejects escapes truncated by the end of the
    # string ("abc%", "abc%4"), which unquote() would otherwise pass
    # through unchanged.
    if re.search(r"%(?![0-9A-Fa-f]{2})", text):
        raise ValueError("Invalid percent encoding")

    try:
        return urllib.parse.unquote(text, errors="strict")
    except UnicodeDecodeError as e:
        raise ValueError(f"Invalid percent encoding: {e}") from e
def encode_uri(uri: str) -> str:
    """Percent-encode a complete URI while keeping its delimiters intact.

    Modelled on JavaScript's ``encodeURI``.

    Args:
        uri: URI to encode

    Returns:
        Encoded URI

    Examples:
        >>> encode_uri('https://example.com/path with spaces')
        'https://example.com/path%20with%20spaces'
    """
    # Characters that structure a URI and are therefore left unescaped.
    uri_delimiters = ":/?#[]@!$&'()*+,;="
    encoded = urllib.parse.quote(uri, safe=uri_delimiters)
    return encoded
def decode_uri(uri: str) -> str:
    """Decode the percent escapes in a complete URI.

    Modelled on JavaScript's ``decodeURI``. Every escape is decoded,
    including escapes of delimiter characters such as ``%2F``.

    Args:
        uri: URI to decode

    Returns:
        Decoded URI

    Examples:
        >>> decode_uri('https://example.com/path%20with%20spaces')
        'https://example.com/path with spaces'
    """
    decoded = urllib.parse.unquote(uri)
    return decoded
[docs] def build_url( protocol: str = "http", hostname: str = "", port: int | None = None, pathname: str = "", query_params: dict[str, str] | None = None, hash_fragment: str = "", ) -> str: """Build a URL from components. Args: protocol: Protocol (without colon) hostname: Hostname port: Port number pathname: Path query_params: Query parameters hash_fragment: Hash fragment Returns: Complete URL string Examples: >>> build_url( ... hostname='example.com', ... pathname='/api/users', ... query_params={'page': '1'} ... ) 'http://example.com/api/users?page=1' """ if not hostname: raise ValueError("hostname is required") # Build base URL url = f"{protocol}://{hostname}" # Add port if specified and not default if port and not ( (protocol == "http" and port == 80) or (protocol == "https" and port == 443) ): url += f":{port}" # Add pathname if pathname: if not pathname.startswith("/"): pathname = "/" + pathname url += pathname # Add query parameters if query_params: query_string = urllib.parse.urlencode(query_params) url += "?" + query_string # Add hash fragment if hash_fragment: if not hash_fragment.startswith("#"): hash_fragment = "#" + hash_fragment url += hash_fragment return url
def is_valid_url(url: str) -> bool:
    """Report whether a string parses as a URL with a scheme and a host part.

    Args:
        url: String to check

    Returns:
        True if both a scheme and a network location are present,
        False otherwise (including when parsing fails)

    Examples:
        >>> is_valid_url('https://example.com')
        True
        >>> is_valid_url('not-a-url')
        False
        >>> is_valid_url('ftp://files.example.com')
        True
    """
    try:
        parts = urllib.parse.urlparse(url)
    except Exception:
        # Any parsing failure simply means "not a valid URL".
        return False
    return bool(parts.scheme and parts.netloc)
[docs] def get_domain(url: str) -> str | None: """Extract domain from URL. Args: url: URL string Returns: Domain string or None if invalid Examples: >>> get_domain('https://www.example.com/path') 'www.example.com' >>> get_domain('http://subdomain.example.org:8080/api') 'subdomain.example.org' >>> get_domain('invalid-url') None """ try: parsed = urllib.parse.urlparse(url) if not parsed.scheme or not parsed.netloc: return None return parsed.hostname except Exception: return None
def get_query_params(url: str) -> dict[str, str]:
    """Extract the query parameters of a URL as a dictionary.

    Blank values are kept. When a key occurs more than once, the last
    occurrence wins.

    Args:
        url: URL string

    Returns:
        Dictionary of query parameters; empty if the URL has no query
        string or cannot be parsed

    Examples:
        >>> get_query_params(
        ...     'https://example.com/path?foo=bar&baz=qux'
        ... )
        {'foo': 'bar', 'baz': 'qux'}
        >>> get_query_params('https://example.com/path')
        {}
    """
    try:
        query_string = urllib.parse.urlparse(url).query
        if query_string:
            pairs = urllib.parse.parse_qsl(query_string, keep_blank_values=True)
        else:
            pairs = []
        return dict(pairs)
    except Exception:
        return {}