1
0
mirror of https://gitlab.crans.org/mediatek/med.git synced 2025-07-07 09:44:01 +02:00

Split comic strips and mangas

This commit is contained in:
Yohann D'ANELLO
2020-05-21 16:56:41 +02:00
parent 1657f5c42c
commit 82efeba272
10 changed files with 202 additions and 12 deletions

View File

@@ -58,6 +58,7 @@ class BedetequeScraper:
regex_subtitle = r'<h2>\s*(.*)</h2>'
regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
regex_nb_of_pages = r'numberOfPages\">(\d*)</span'
regex_format = r'<label>Format : </label>Format (\w+)</li>'
regex_author = r'<span itemprop=\"author\">(((?!<).)*)</span>'
regex_illustrator = r'span itemprop=\"illustrator\">(((?!<).)*)</span'
@@ -89,6 +90,11 @@ class BedetequeScraper:
elif 'number_of_pages' not in data:
data['number_of_pages'] = 0
# Get format of the book
search_format = re.search(regex_format, content)
if search_format:
data['format'] = search_format.group(1).lower()
# Get author and illustrator
author = re.search(regex_author, content)
if 'author' not in data: