# public-apis/scripts/validate/links.py
# -*- coding: utf-8 -*-
import re
import sys
import random
from typing import List, Tuple
import requests
def find_links_in_text(text: str) -> List[str]:
    """Extract every URL found in *text*, with any trailing '/' stripped."""
    # Matches http(s)://, www., and bare-domain links while balancing
    # parentheses and excluding trailing punctuation.
    url_regex = re.compile(
        r'((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'
        r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+'
        r'(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))'
    )
    # findall returns a tuple per match; element 0 is the full outer group.
    return [str(match[0]).rstrip('/') for match in url_regex.findall(text)]
def find_links_in_file(filename: str) -> List[str]:
    """Find links in a file and return a list of URLs from text file.

    Scanning starts at the '## Index' section so links in the preamble are
    skipped.  If that section is missing, the whole file is scanned: the
    previous code sliced with str.find()'s -1 sentinel, which silently
    reduced the content to the file's last character and returned no links.
    """
    with open(filename, mode='r', encoding='utf-8') as file:
        readme = file.read()

    index_section = readme.find('## Index')
    if index_section == -1:
        index_section = 0  # no '## Index' section: scan the entire file

    content = readme[index_section:]
    links = find_links_in_text(content)
    return links
def check_duplicate_links(links: List[str]) -> Tuple[bool, List]:
    """Check for duplicated links.

    Returns a tuple with True or False and duplicate list.

    Each duplicated URL appears exactly once in the returned list.  The
    previous counter-based version set the count to 1 and never incremented
    it, so a link occurring three or more times was appended on every
    repeat, bloating the duplicate report.
    """
    seen = set()
    duplicates = []

    for link in links:
        if link in seen:
            if link not in duplicates:  # report each duplicate only once
                duplicates.append(link)
        else:
            seen.add(link)

    return (bool(duplicates), duplicates)
def fake_user_agent() -> str:
    """Faking user agent as some hosting services block not-whitelisted UA."""
    # Pool of realistic browser UA strings; a random one is returned per
    # call so repeated checks do not all present an identical client.
    browser_agents = (
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    )
    return random.choice(browser_agents)
def get_host_from_link(link: str) -> str:
    """Return the bare host of *link*: no scheme, route, query or anchor."""
    # Strip the scheme ('http://', 'https://', ...) when present.
    host = link.split('://', 1)[1] if '://' in link else link

    # Remove routes, arguments and anchors.  Each separator is stripped
    # independently: the previous elif chain cut only at the first
    # separator found, so a link with no '/' but both '?' and '#'
    # (e.g. 'host?a=b#frag') kept trailing query/anchor text in the host.
    for separator in ('/', '?', '#'):
        host = host.split(separator, 1)[0]

    return host
def check_if_link_is_working(link: str) -> Tuple[bool, str]:
    """Checks if a link is working.

    If an error is identified when the request for the link occurs,
    the return will be a tuple with the first value True and the second
    value a string containing the error message.

    If no errors are identified, the return will be a tuple with the
    first value False and the second an empty string.
    """
    has_error = False
    error_message = ''

    try:
        # Links are stored with the trailing '/' stripped; re-append it
        # for the request.
        resp = requests.get(link + '/', timeout=25, headers={
            'User-Agent': fake_user_agent(),
            'host': get_host_from_link(link)
        })

        code = resp.status_code
        if code >= 400:
            has_error = True
            error_message = f'ERR:CLT: {code} : {link}'

    # requests.exceptions.Timeout is the parent of both ConnectTimeout and
    # ReadTimeout; the old code caught only ConnectTimeout, so read
    # timeouts were misreported as ERR:UKN instead of ERR:TMO.
    except (TimeoutError, requests.exceptions.Timeout):
        has_error = True
        error_message = f'ERR:TMO: {link}'

    except requests.exceptions.SSLError as error:
        has_error = True
        error_message = f'ERR:SSL: {error} : {link}'

    except requests.exceptions.TooManyRedirects as error:
        has_error = True
        error_message = f'ERR:TMR: {error} : {link}'

    # Boundary catch-all: a dead link must be reported, never crash the run.
    except Exception as error:
        has_error = True
        error_message = f'ERR:UKN: {error} : {link}'

    return (has_error, error_message)
if __name__ == '__main__':
    # The markdown file to validate must be passed as the first argument.
    if len(sys.argv) < 2:
        print('No .md file passed')
        sys.exit(1)

    links = find_links_in_file(sys.argv[1])

    print('Checking for duplicate links...')
    has_duplicate_link, duplicates_links = check_duplicate_links(links)

    if has_duplicate_link:
        print(f'Found duplicate links: {duplicates_links}')
    else:
        print('No duplicate links.')