From 9fd1f8b66a467e8dfb443f940d97a20972746f37 Mon Sep 17 00:00:00 2001 From: AndreyeuIvan Date: Tue, 7 Jul 2020 22:43:34 +0300 Subject: [PATCH] 1. pip3 install poetry\n 2. poetry add requests(as example) in order to add requeriment into poetry --- pyvideo_scrape_mpm/LICENSE | 21 ++ pyvideo_scrape_mpm/README.rst | 0 pyvideo_scrape_mpm/poetry.lock | 253 ++++++++++++++++++ pyvideo_scrape_mpm/pyproject.toml | 17 ++ .../pyvideo_scrape_mpm/__init__.py | 1 + .../pyvideo_scrape_mpm/scrape.py | 121 +++++++++ pyvideo_scrape_mpm/requirements.txt | 3 + pyvideo_scrape_mpm/tests/__init__.py | 0 .../tests/test_pyvideo_scrape_mpm.py | 5 + 9 files changed, 421 insertions(+) create mode 100644 pyvideo_scrape_mpm/LICENSE create mode 100644 pyvideo_scrape_mpm/README.rst create mode 100644 pyvideo_scrape_mpm/poetry.lock create mode 100644 pyvideo_scrape_mpm/pyproject.toml create mode 100644 pyvideo_scrape_mpm/pyvideo_scrape_mpm/__init__.py create mode 100644 pyvideo_scrape_mpm/pyvideo_scrape_mpm/scrape.py create mode 100644 pyvideo_scrape_mpm/requirements.txt create mode 100644 pyvideo_scrape_mpm/tests/__init__.py create mode 100644 pyvideo_scrape_mpm/tests/test_pyvideo_scrape_mpm.py diff --git a/pyvideo_scrape_mpm/LICENSE b/pyvideo_scrape_mpm/LICENSE new file mode 100644 index 0000000..ec85c47 --- /dev/null +++ b/pyvideo_scrape_mpm/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Minsk Python Community + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/pyvideo_scrape_mpm/README.rst b/pyvideo_scrape_mpm/README.rst new file mode 100644 index 0000000..e69de29 diff --git a/pyvideo_scrape_mpm/poetry.lock b/pyvideo_scrape_mpm/poetry.lock new file mode 100644 index 0000000..1853403 --- /dev/null +++ b/pyvideo_scrape_mpm/poetry.lock @@ -0,0 +1,253 @@ +[[package]] +category = "dev" +description = "Atomic file writes." +marker = "sys_platform == \"win32\"" +name = "atomicwrites" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "1.4.0" + +[[package]] +category = "dev" +description = "Classes Without Boilerplate" +name = "attrs" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "19.3.0" + +[package.extras] +azure-pipelines = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "pytest-azurepipelines"] +dev = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "sphinx", "pre-commit"] +docs = ["sphinx", "zope.interface"] +tests = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"] + +[[package]] +category = "dev" +description = "Cross-platform colored terminal text." 
+marker = "sys_platform == \"win32\"" +name = "colorama" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +version = "0.4.3" + +[[package]] +category = "dev" +description = "Read metadata from Python packages" +marker = "python_version < \"3.8\"" +name = "importlib-metadata" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +version = "1.7.0" + +[package.dependencies] +zipp = ">=0.5" + +[package.extras] +docs = ["sphinx", "rst.linker"] +testing = ["packaging", "pep517", "importlib-resources (>=1.3)"] + +[[package]] +category = "dev" +description = "More routines for operating on iterables, beyond itertools" +name = "more-itertools" +optional = false +python-versions = ">=3.5" +version = "8.4.0" + +[[package]] +category = "dev" +description = "Core utilities for Python packages" +name = "packaging" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "20.4" + +[package.dependencies] +pyparsing = ">=2.0.2" +six = "*" + +[[package]] +category = "dev" +description = "plugin and hook calling mechanisms for python" +name = "pluggy" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "0.13.1" + +[package.dependencies] +[package.dependencies.importlib-metadata] +python = "<3.8" +version = ">=0.12" + +[package.extras] +dev = ["pre-commit", "tox"] + +[[package]] +category = "dev" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +name = "py" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "1.9.0" + +[[package]] +category = "dev" +description = "Python parsing module" +name = "pyparsing" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +version = "2.4.7" + +[[package]] +category = "dev" +description = "pytest: simple powerful testing with Python" +name = "pytest" +optional = false +python-versions = ">=3.5" +version = 
"5.4.3" + +[package.dependencies] +atomicwrites = ">=1.0" +attrs = ">=17.4.0" +colorama = "*" +more-itertools = ">=4.0.0" +packaging = "*" +pluggy = ">=0.12,<1.0" +py = ">=1.5.0" +wcwidth = "*" + +[package.dependencies.importlib-metadata] +python = "<3.8" +version = ">=0.12" + +[package.extras] +checkqa-mypy = ["mypy (v0.761)"] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + +[[package]] +category = "main" +description = "A Python Slugify application that handles Unicode" +name = "python-slugify" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +version = "4.0.1" + +[package.dependencies] +text-unidecode = ">=1.3" + +[package.extras] +unidecode = ["Unidecode (>=1.1.1)"] + +[[package]] +category = "dev" +description = "Python 2 and 3 compatibility utilities" +name = "six" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +version = "1.15.0" + +[[package]] +category = "main" +description = "The most basic Text::Unidecode port" +name = "text-unidecode" +optional = false +python-versions = "*" +version = "1.3" + +[[package]] +category = "dev" +description = "Measures the displayed width of unicode strings in a terminal" +name = "wcwidth" +optional = false +python-versions = "*" +version = "0.2.5" + +[[package]] +category = "main" +description = "YouTube video downloader" +name = "youtube-dl" +optional = false +python-versions = "*" +version = "2020.6.16.1" + +[[package]] +category = "dev" +description = "Backport of pathlib-compatible object wrapper for zip files" +marker = "python_version < \"3.8\"" +name = "zipp" +optional = false +python-versions = ">=3.6" +version = "3.1.0" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] +testing = ["jaraco.itertools", "func-timeout"] + +[metadata] +content-hash = "4ddd4133d2da930966fcfe58494b6935da3eb636a5e74f0d3067d3346e1e0375" +python-versions = "^3.6" + +[metadata.files] 
+atomicwrites = [ + {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, + {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, +] +attrs = [ + {file = "attrs-19.3.0-py2.py3-none-any.whl", hash = "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c"}, + {file = "attrs-19.3.0.tar.gz", hash = "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"}, +] +colorama = [ + {file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"}, + {file = "colorama-0.4.3.tar.gz", hash = "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"}, +] +importlib-metadata = [ + {file = "importlib_metadata-1.7.0-py2.py3-none-any.whl", hash = "sha256:dc15b2969b4ce36305c51eebe62d418ac7791e9a157911d58bfb1f9ccd8e2070"}, + {file = "importlib_metadata-1.7.0.tar.gz", hash = "sha256:90bb658cdbbf6d1735b6341ce708fc7024a3e14e99ffdc5783edea9f9b077f83"}, +] +more-itertools = [ + {file = "more-itertools-8.4.0.tar.gz", hash = "sha256:68c70cc7167bdf5c7c9d8f6954a7837089c6a36bf565383919bb595efb8a17e5"}, + {file = "more_itertools-8.4.0-py3-none-any.whl", hash = "sha256:b78134b2063dd214000685165d81c154522c3ee0a1c0d4d113c80361c234c5a2"}, +] +packaging = [ + {file = "packaging-20.4-py2.py3-none-any.whl", hash = "sha256:998416ba6962ae7fbd6596850b80e17859a5753ba17c32284f67bfff33784181"}, + {file = "packaging-20.4.tar.gz", hash = "sha256:4357f74f47b9c12db93624a82154e9b120fa8293699949152b22065d556079f8"}, +] +pluggy = [ + {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, + {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, +] +py = [ + {file = "py-1.9.0-py2.py3-none-any.whl", hash = 
"sha256:366389d1db726cd2fcfc79732e75410e5fe4d31db13692115529d34069a043c2"}, + {file = "py-1.9.0.tar.gz", hash = "sha256:9ca6883ce56b4e8da7e79ac18787889fa5206c79dcc67fb065376cd2fe03f342"}, +] +pyparsing = [ + {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, + {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, +] +pytest = [ + {file = "pytest-5.4.3-py3-none-any.whl", hash = "sha256:5c0db86b698e8f170ba4582a492248919255fcd4c79b1ee64ace34301fb589a1"}, + {file = "pytest-5.4.3.tar.gz", hash = "sha256:7979331bfcba207414f5e1263b5a0f8f521d0f457318836a7355531ed1a4c7d8"}, +] +python-slugify = [ + {file = "python-slugify-4.0.1.tar.gz", hash = "sha256:69a517766e00c1268e5bbfc0d010a0a8508de0b18d30ad5a1ff357f8ae724270"}, +] +six = [ + {file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"}, + {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"}, +] +text-unidecode = [ + {file = "text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"}, + {file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"}, +] +wcwidth = [ + {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, + {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, +] +youtube-dl = [ + {file = "youtube_dl-2020.6.16.1-py2.py3-none-any.whl", hash = "sha256:e54b307048bb18164729fb278013af6d5477c69c3d995147205a16f22a61296b"}, + {file = "youtube_dl-2020.6.16.1.tar.gz", hash = "sha256:9fc0389a1bbbeb609a5bb4ad5630dea107a9d1a24c73721c611a78c234309a75"}, +] +zipp = [ + {file = "zipp-3.1.0-py3-none-any.whl", 
# NOTE: the two "# |" comment lines in this block are patch framing (lock-file
# tail, pyproject.toml / __init__.py hunks, requirements.txt and tests hunks)
# that shares these physical lines with scrape.py in the collapsed patch dump.
# They are kept verbatim so nothing is lost; they are not Python.
# | hash = "sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b"}, + {file = "zipp-3.1.0.tar.gz", hash = "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96"}, +] diff --git a/pyvideo_scrape_mpm/pyproject.toml b/pyvideo_scrape_mpm/pyproject.toml new file mode 100644 index 0000000..e760fdd --- /dev/null +++ b/pyvideo_scrape_mpm/pyproject.toml @@ -0,0 +1,17 @@ +[tool.poetry] +name = "pyvideo_scrape_mpm" +version = "0.1.0" +description = "" +authors = ["AndreyeuIvan "] + +[tool.poetry.dependencies] +python = "^3.6" +python-slugify = "^4.0.1" +youtube-dl = "^2020.6.16" + +[tool.poetry.dev-dependencies] +pytest = "^5.2" + +[build-system] +requires = ["poetry>=0.12"] +build-backend = "poetry.masonry.api" diff --git a/pyvideo_scrape_mpm/pyvideo_scrape_mpm/__init__.py b/pyvideo_scrape_mpm/pyvideo_scrape_mpm/__init__.py new file mode 100644 index 0000000..b794fd4 --- /dev/null +++ b/pyvideo_scrape_mpm/pyvideo_scrape_mpm/__init__.py @@ -0,0 +1 @@ +__version__ = '0.1.0' diff --git a/pyvideo_scrape_mpm/pyvideo_scrape_mpm/scrape.py b/pyvideo_scrape_mpm/pyvideo_scrape_mpm/scrape.py new file mode 100644 index 0000000..32380c1 --- /dev/null +++ b/pyvideo_scrape_mpm/pyvideo_scrape_mpm/scrape.py @@ -0,0 +1,121 @@
#!/usr/bin/env python
"""Build JSON metadata files for Minsk Python Meetup videos.

Reads YouTube URLs from a list file, asks youtube-dl for their metadata
(nothing is downloaded) and writes one JSON file per video, using a relative
file name, i.e. into the current working directory.
"""
import datetime
import itertools
import json

# youtube_dl and slugify are imported inside the two functions that need them,
# so the pure title/description parsing helpers below stay importable when the
# third-party scraping dependencies are not installed.


DEFAULT_LANGUAGE = "rus"
DEFAULT_SILENT_MODE = False  # not referenced anywhere in this module
DEFAULT_URLS_LIST_FILENAME = "urls.list"


JSON_FORMAT_KWARGS = {
    "indent": 2,
    "separators": (",", ": "),
    "sort_keys": True,
    "ensure_ascii": False,
}

# Everything from this phrase ("Join us!") onwards is dropped from the video
# description.
DESCRIPTION_SEPARATOR = "Присоединяйся к нам!"


def main(list_filename=DEFAULT_URLS_LIST_FILENAME):
    """Write one JSON metadata file per video reachable from *list_filename*.

    Args:
        list_filename: path of a text file with one URL per line; blank lines
            are ignored. Defaults to ``DEFAULT_URLS_LIST_FILENAME``.
    """
    urls = _read_urls(list_filename)
    # chain.from_iterable flattens the per-URL lists lazily; the previous
    # sum(lists, []) re-copied the accumulated list for every URL.
    entries = itertools.chain.from_iterable(get_entries(url) for url in urls)
    for meta in entries:
        prepared_meta = get_prepared_meta(meta)
        filename = generate_filename(meta)
        with open(filename, "w", encoding="utf8") as json_file:
            json.dump(prepared_meta, json_file, **JSON_FORMAT_KWARGS)


def _read_urls(list_filename):
    """Return the stripped, non-blank lines of *list_filename* as a list."""
    # The context manager closes the file; it used to be left open.
    with open(list_filename, encoding="utf8") as urls_file:
        stripped_lines = (line.strip() for line in urls_file)
        return [line for line in stripped_lines if line]


def get_entries(url):
    """Return the list of video metadata dicts youtube-dl finds at *url*.

    A result that carries an ``"entries"`` collection yields its non-empty
    entries; any other result is returned as a one-element list. An empty
    list is returned when youtube-dl produced no data at all.
    """
    import youtube_dl

    youtube = youtube_dl.YoutubeDL({"ignoreerrors": True})
    data = youtube.extract_info(url, download=False)
    if not data:
        # With "ignoreerrors" set, a failed extraction comes back as None
        # instead of raising; calling .get() on it would crash the whole run.
        return []
    entries = data.get("entries")
    if entries is None:
        return [data]
    # Drop falsy placeholders (failed playlist items). An empty playlist now
    # yields [] instead of the playlist dict being treated as a video.
    return [entry for entry in entries if entry]


def get_prepared_meta(data):
    """Map a youtube-dl video dict onto the JSON structure that gets written.

    Args:
        data: dict with at least the keys ``license``, ``duration``,
            ``title``, ``thumbnail`` and ``webpage_url``.

    Returns:
        dict ready to be serialized with ``json.dump``.
    """
    return {
        "copyright_text": data["license"],
        "description": extract_description(data),
        "duration": data["duration"],
        "language": DEFAULT_LANGUAGE,
        "recorded": extract_date_recorded(data),
        "related_urls": [
            {"label": "GitHub", "url": "https://github.com/minskpython"},
        ],
        "speakers": extract_speakers(data),
        "tags": ["minsk", "belarus"],
        "thumbnail_url": extract_thumbnail_url(data),
        "title": extract_title(data),
        "videos": [{"type": "youtube", "url": data["webpage_url"]}],
    }


def generate_filename(data):
    """Return ``<date>-<title-slug>-<speakers-slug>.json`` for a video dict.

    Empty components are left out, so a title without a speaker gives
    ``<date>-<title-slug>.json`` rather than a name ending in ``-.json``.
    """
    import slugify

    parts = (
        extract_date_recorded(data),
        slugify.slugify(extract_title(data)),
        slugify.slugify("-".join(extract_speakers(data))),
    )
    return "-".join(part for part in parts if part) + ".json"


def extract_description(data):
    """Return the description text that precedes ``DESCRIPTION_SEPARATOR``.

    A missing or ``None`` description is treated as an empty string.
    """
    description = data.get("description") or ""
    return sanitize(description.partition(DESCRIPTION_SEPARATOR)[0])


# Original (misspelled) public name, kept so existing callers keep working.
extract_decsription = extract_description


def extract_date_recorded(data):
    """Return the recording date found in the title as ``YYYY-MM-DD``.

    The date is taken from the text after the last "Python Meetup" marker, up
    to an optional closing "]": its first two characters are the day,
    characters 4-5 the month and the last four the year (e.g. "26.06.2020").

    Raises:
        ValueError: if those positions do not form a valid date.
    """
    # date detection logic needs improvement
    title = data["title"]
    date_str = title.split("Python Meetup")[-1].split("]")[0].strip()
    day, month, year = date_str[:2], date_str[3:5], date_str[-4:]
    try:
        recorded = datetime.date(int(year), int(month), int(day))
    except ValueError as err:
        # Same exception type as before, but it now names the offending title.
        raise ValueError(
            "cannot find a DD.MM.YYYY date in title {!r}".format(title)
        ) from err
    return recorded.isoformat()


def extract_title(data):
    """Return the talk title: the sanitized text before the first "/"."""
    # TODO: use smart regex here
    # Without a "/" the first (only) part is the whole title.
    return sanitize(data["title"].split("/")[0])


def extract_speakers(data):
    """Return the speaker names found in the title.

    The name is the second "/"-separated part of the title. The result is an
    empty list when the title has no "/" or that part is blank.
    """
    # TODO: use smart regex here
    title_parts = data["title"].split("/")
    if len(title_parts) < 2:
        return []
    speaker_name = sanitize(title_parts[1])
    return [speaker_name] if speaker_name else []


def extract_thumbnail_url(data):
    """Return the thumbnail URL to publish for a video dict.

    ``data["thumbnail"]`` is used unless it is an "hqdefault" image without
    the "?sqp" modifier; in that case the last item of ``data["thumbnails"]``
    is preferred when that list exists and is non-empty.
    """
    thumbnail_candidate = data["thumbnail"]
    if "hqdefault" in thumbnail_candidate and "?sqp" not in thumbnail_candidate:
        # hqdefault image without '?sqp' modifier isn't so good,
        # trying to get a more suitable thumbnail...
        thumbnails = data.get("thumbnails") or []
        if thumbnails:
            thumbnail_candidate = thumbnails[-1]["url"]
    return thumbnail_candidate


def sanitize(title_substring):
    """Remove zero-width spaces (U+200B), then strip surrounding whitespace."""
    return title_substring.replace("\u200b", "").strip()


if __name__ == "__main__":
    main()
# | diff --git a/pyvideo_scrape_mpm/requirements.txt b/pyvideo_scrape_mpm/requirements.txt new file mode 100644 index 0000000..5c41ca1 --- /dev/null +++ b/pyvideo_scrape_mpm/requirements.txt @@ -0,0 +1,3 @@ +python-slugify +youtube-dl + diff --git a/pyvideo_scrape_mpm/tests/__init__.py b/pyvideo_scrape_mpm/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pyvideo_scrape_mpm/tests/test_pyvideo_scrape_mpm.py b/pyvideo_scrape_mpm/tests/test_pyvideo_scrape_mpm.py new file mode 100644 index 0000000..455e4ed --- /dev/null +++ b/pyvideo_scrape_mpm/tests/test_pyvideo_scrape_mpm.py @@ -0,0 +1,5 @@ +from pyvideo_scrape_mpm import __version__ + + +def test_version(): + assert __version__ == '0.1.0'