baijiahao_data_crawl/venv/Lib/site-packages/publicsuffix2/__init__.py

# -*- coding: utf-8 -*-
#
# Copyright (c) 2019 nexB Inc. and Renée Burton
# Copyright (c) 2015 nexB Inc.
# This code is based on Tomaž Šolc's fork of David Wilson's code originally at
# https://www.tablix.org/~avian/git/publicsuffix.git
#
# Copyright (c) 2014 Tomaž Šolc <tomaz.solc@tablix.org>
#
# David Wilson's code was originally at:
# from http://code.google.com/p/python-public-suffix-list/
#
# Copyright (c) 2009 David Wilson
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
# The Public Suffix List vendored in this distribution has been downloaded
# from http://publicsuffix.org/public_suffix_list.dat
# This data file is licensed under the MPL-2.0 license.
# http://mozilla.org/MPL/2.0/

"""
Public Suffix List module for Python.
"""

from __future__ import absolute_import
from __future__ import unicode_literals

import codecs
from os import path
import warnings

try:
    from urllib.request import urlopen, Request
except ImportError:
    from urllib2 import urlopen, Request


PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'

BASE_DIR = path.dirname(__file__)
PSL_FILE = path.join(BASE_DIR, 'public_suffix_list.dat')
ABOUT_PSL_FILE = path.join(BASE_DIR, 'public_suffix_list.ABOUT')


class PublicSuffixList(object):

    def __init__(self, psl_file=None, idna=True):
        """
        Read and parse a public suffix list. `psl_file` is either a file
        location string, or a file-like object, or an iterable of lines from a
        public suffix data file.

        If psl_file is None, the vendored file named "public_suffix_list.dat" is
        loaded. It is stored side by side with this Python package.

        The Mozilla public suffix list is no longer IDNA-encoded, it is UTF-8.
        For use cases with domains that are IDNA encoded, choose idna=True and
        the list will be converted upon loading. The wrong encoding will provide
        incorrect answers in either use case.

        The file format is described at http://publicsuffix.org/

        :param psl_file: string or None
        :param idna: boolean, whether to convert file to IDNA-encoded strings
        """
        # Note: we test for None as we accept empty lists as inputs
        if psl_file is None or isinstance(psl_file, str):
            with codecs.open(psl_file or PSL_FILE, 'r', encoding='utf8') as psl:
                psl = psl.readlines()
        else:
            # assume file-like
            psl = psl_file

        # a list of eTLDs with their modifiers, e.g., *
        self.tlds = []
        root = self._build_structure(psl, idna)
        self.root = self._simplify(root)

    def _find_node(self, parent, parts):
        """
        Processing each line of the public suffix list recursively to build the
        Trie.  Each line is processed into a dictionary, which may contain sub-
        Trie, and nodes terminate in node of either 0 or 1 (negate).

        This method takes the current parent Trie, and searches it for the next
        part in the line (child). If not found, it adds a node to the Trie,
        creating a new branch with the [0]. If found, the existing sub-Trie is
        passed for the next part.

        :param parent: current Trie, form is Tuple (negate, dict of Trie)
        :param parts: list of strings
        :return: recursive search for remaining domain parts
        """
        if not parts:
            return parent

        # this initiates the Trie from a new node as [negate, dict()]
        if len(parent) == 1:
            parent.append({})

        assert len(parent) == 2
        _negate, children = parent

        child = parts.pop()

        # if child already exists as a node, grab the sub-Trie
        child_node = children.get(child, None)

        # if it doesn't exist, creates a new node and initialized with [0]
        if not child_node:
            children[child] = child_node = [0]

        return self._find_node(child_node, parts)

    def _add_rule(self, root, rule):
        """
        Initial setup for a line of the public suffix list. If it starts with !
        that is a negation operation. this calls the find_node() method
        recursively to build out the Trie for this rule.

        :param root: root Trie
        :param rule: string, line of public suffixlist
        :return: None
        """
        if rule.startswith('!'):
            negate = 1
            rule = rule[1:]
        else:
            negate = 0

        parts = rule.split('.')
        self._find_node(root, parts)[0] = negate

    def _simplify(self, node):
        """
        Condense the lines of the Trie in place.

        :param node: node in the Trie, either 0/1 or a subTrie
        :return: simplified Trie, form Tuple
        """
        if len(node) == 1:
            return node[0]

        return (node[0], dict((k, self._simplify(v)) for (k, v) in node[1].items()))

    def _build_structure(self, fp, idna):
        """
        Build a Trie from the public suffix list. If idna==True, idna-encode
        each line before building.

        The Trie is comprised of tuples that encode whether the line is a
        negation line (0 or 1), and terminate with 0. Each node is represented
        with two-tuple of the form (negate, dict of children / sub-Trie). A
        partial subTrie therefore looks like: (0, {'ac': 0, 'co': (0,
        {'blogspot': 0}), 'gv': 0,....}) where each tuple starts with the
        negation encoding, and each leaf in the Trie as a dictionary element
        returns 0.

        Also creates an instance attribute, tlds, which simply contains the
        publicsuffix list, with the modifiers such as wildcards, as a list. This
        can be accessed for post-processing by the application.

        :param fp: pointer for the public suffix list
        :param idna: boolean, convert lines to idna-encoded strings
        :return: Trie
        """
        root = [0]

        tlds = self.tlds

        for line in fp:
            line = line.strip()
            if not line or line.startswith('//'):
                continue
            if idna:
                line = line.encode('idna').decode()
            tlds.append(line)

            self._add_rule(root, line.split()[0].lstrip('.'))

        return root

    def _lookup_node(self, matches, depth, parent, parts, wildcard):
        """
        Traverses the Trie recursively to find the parts. By default, the
        traverse follows wildcards, as appropriate for the public suffix list,
        but if wildcard is set to False, it will stop at wildcard leaves. This
        can be useful for summarizing complex wildcard domains like those under
        amazonaws.com.

        The lookup is tracked via a list, initially set to all None, that marks
        the negation flags of nodes it matches. each match will be marked for
        later composition of the eTLD.

        :param matches: list, parts long, None (initial), 0, or 1
        :param depth: int, how far in the Trie this run is
        :param parent: Tuple, the current subTrie
        :param parts: list of domain parts, strings
        :param wildcard: boolean, whether to process wildcard nodes
        :return: None, recursive call
        """
        if wildcard and depth == 1:
            # if no rules match, the prevailing rule is "*"
            # See: Algorithm 2 at https://publicsuffix.org/list/
            matches[-depth] = 0

        if parent in (0, 1):
            return

        children = parent[1]

        if depth <= len(parts) and children:
            for name in ('*', parts[-depth]):
                child = children.get(name, None)
                if child is not None:
                    if wildcard or name != '*':
                        if child in (0, 1):
                            negate = child
                        else:
                            negate = child[0]
                        matches[-depth] = negate
                        self._lookup_node(matches, depth + 1, child, parts, wildcard)

    def get_sld(self, domain, wildcard=True, strict=False):
        """
        Return the second-level-domain (SLD) or private suffix of a given domain
        according to the public suffix list. The public suffix list includes
        wildcards, so if wildcard is set to True, this will follow the wildcard
        on traversal, otherwise it will stop at wildcard nodes.

        The logic does not check by default whether the TLD is in the Trie, so
        for example, 'www.this.local' will return 'this.local'. If you want to
        ensure the TLD is in the public suffix list, use strict=True.

        If domain is already an eTLD, it returns domain as-is instead of None
        value.

        :param domain: string, needs to match the encoding of the PSL (idna or UTF8)
        :param wildcard: boolean, follow wildcard patterns
        :param strict: boolean, check the TLD is valid, return None if not
        :return: string, the SLD for the domain
        """
        if not domain:
            return None

        # for compatibility, set strict True not to allow invalid TLDs
        tld = self.get_tld(domain, wildcard, True)
        if strict and tld is None:
            return None

        parts = domain.lower().strip('.').split('.')
        num_of_tld_parts = 0 if tld is None else tld.count('.') + 1

        if len(parts) <= num_of_tld_parts:
            return tld
        else:
            return '.'.join(parts[-(num_of_tld_parts + 1):])

    def get_public_suffix(self, domain, wildcard=True, strict=False):
        """
        Use get_sld() instead.
        """
        return self.get_sld(domain, wildcard, strict)

    def get_tld(self, domain, wildcard=True, strict=False):
        """
        Return the TLD, or public suffix, of a domain using the public suffix
        list. uses wildcards if set, and checks for valid top TLD is
        strict=True.

        This will return the domain itself when it is an ICANN TLD, e.g., 'com'
        returns 'com', for follow on processing, while 'co.uk' return 'uk'. On
        the other hand, more complicated domains will return their public
        suffix, e.g.,
        'google.co.uk' will return 'co.uk'.  Root ('.') will return empty string.

        :param domain: string
        :param wildcard: boolean, follow wildcards in Trie
        :param strict: boolean, check that top TLD is valid in Trie
        :return: string, the TLD for the domain
        """
        if not domain:
            return None
        parts = domain.lower().strip('.').split('.')
        hits = [None] * len(parts)
        if strict and (
            self.root in (0, 1) or parts[-1] not in self.root[1].keys()
        ):
            return None

        self._lookup_node(hits, 1, self.root, parts, wildcard)

        for i, what in enumerate(hits):
            if what is not None and what == 0:
                return '.'.join(parts[i:])


_PSL = None


def get_sld(domain, psl_file=None, wildcard=True, idna=True, strict=False):
    """
    Return the private suffix or SLD for a `domain` DNS name string. The
    original publicsuffix2 library used the method get_public_suffix() for this
    purpose, but get_private_suffix() is more proper. Convenience function that
    builds and caches a PublicSuffixList object.

    Optionally read, and parse a public suffix list. `psl_file` is either a file
    location string, or a file-like object, or an iterable of lines from a
    public suffix data file.

    If psl_file is None, the vendored file named "public_suffix_list.dat" is
    loaded. It is stored side by side with this Python package.

    The file format is described at http://publicsuffix.org/
    """
    global _PSL
    _PSL = _PSL or PublicSuffixList(psl_file, idna=idna)
    return _PSL.get_sld(domain, wildcard=wildcard, strict=strict)


def get_tld(domain, psl_file=None, wildcard=True, idna=True, strict=False):
    """
    Return the TLD or public suffix for a `domain` DNS name string. (this is
    actually the private suffix that is returned) Convenience function that
    builds and caches a PublicSuffixList object.

    Optionally read, and parse a public suffix list. `psl_file` is either a file
    location string, or a file-like object, or an iterable of lines from a
    public suffix data file.

    If psl_file is None, the vendored file named "public_suffix_list.dat" is
    loaded. It is stored side by side with this Python package.

    The file format is described at http://publicsuffix.org/
    """
    global _PSL
    _PSL = _PSL or PublicSuffixList(psl_file, idna=idna)
    return _PSL.get_tld(domain, wildcard=wildcard, strict=strict)


def get_public_suffix(domain, psl_file=None, wildcard=True, idna=True, strict=False):
    """
    Included for compatibility with the original publicsuffix2 library -- this
    function returns the private suffix or SLD of the domain. To get the public
    suffix, use get_tld(). Convenience function that builds and caches a
    PublicSuffixList object.

    Optionally read, and parse a public suffix list. `psl_file` is either a file
    location string, or a file-like object, or an iterable of lines from a
    public suffix data file.

    If psl_file is None, the vendored file named "public_suffix_list.dat" is
    loaded. It is stored side by side with this Python package.

    The file format is described at http://publicsuffix.org/
    """
    warnings.warn(
        'This function returns the private suffix, SLD, or registrable domain. '
        'This equivalent to function get_sld(). '
        'To get the public suffix itself, use get_tld().',
        UserWarning
    )
    return get_sld(domain, psl_file, wildcard, idna, strict)


def fetch():
    """
    Return a file-like object for the latest public suffix list downloaded from
    publicsuffix.org
    """
    req = Request(PSL_URL, headers={'User-Agent': 'python-publicsuffix2'})
    res = urlopen(req)
    try:
        encoding = res.headers.get_content_charset()
    except AttributeError:
        encoding = res.headers.getparam('charset')
    f = codecs.getreader(encoding)(res)
    return f