1
0
mirror of https://gitlab.crans.org/mediatek/med.git synced 2025-07-06 09:23:55 +02:00

Fix authors & openlibrary scraping

This commit is contained in:
Yohann D'ANELLO
2020-02-10 11:29:26 +01:00
parent c7d804d9bf
commit 7fd8e92371
2 changed files with 45 additions and 20 deletions

View File

@ -5,6 +5,8 @@ import re
import requests
from media.models import Auteur
class BedetequeScraper:
"""
@ -75,10 +77,6 @@ class BedetequeScraper:
subtitle = subtitle.replace('<span class="numa"></span>', '')
data['subtitle'] = ' '.join(subtitle.split())
# TODO implement author
# regex_author = r'author\">([^<]*)</span'
# 'author': re.search(regex_author, content).group(1),
# Get publish date
search_publish_date = re.search(regex_publish_date, content)
if search_publish_date:
@ -92,23 +90,15 @@ class BedetequeScraper:
# Get author and illustrator
author = re.search(regex_author, content)
if 'author' not in data:
data['author'] = list()
data['authors'] = list()
if author:
data['author'].append(author.group(1))
author_obj = Auteur.objects.get_or_create(
name=author.group(1))[0]
data['authors'].append(author_obj)
illustrator = re.search(regex_illustrator, content)
if illustrator:
data['author'].append(illustrator.group(1))
author_name = data['author'][0]
if ',' not in author_name and ' ' in author_name:
author_name = author_name.split(' ')[1]
side_identifier = "{:.3} {:.3}".format(author_name.upper(),
data['title'].upper(),)
if data['subtitle']:
start = data['subtitle'].split(' ')[0].replace('.', '')
print("start:", start)
if start.isnumeric():
side_identifier += " {:0>2}".format(start,)
data['side_identifier'] = side_identifier
author_obj = Auteur.objects.get_or_create(
name=illustrator.group(1))[0]
data['authors'].append(author_obj)
return data