mirror of
https://gitlab.crans.org/mediatek/med.git
synced 2025-07-06 09:23:55 +02:00
Fix authors & openlibrary scrap
This commit is contained in:
@ -5,6 +5,8 @@ import re
|
||||
|
||||
import requests
|
||||
|
||||
from media.models import Auteur
|
||||
|
||||
|
||||
class BedetequeScraper:
|
||||
"""
|
||||
@ -75,10 +77,6 @@ class BedetequeScraper:
|
||||
subtitle = subtitle.replace('<span class="numa"></span>', '')
|
||||
data['subtitle'] = ' '.join(subtitle.split())
|
||||
|
||||
# TODO implement author
|
||||
# regex_author = r'author\">([^<]*)</span'
|
||||
# 'author': re.search(regex_author, content).group(1),
|
||||
|
||||
# Get publish date
|
||||
search_publish_date = re.search(regex_publish_date, content)
|
||||
if search_publish_date:
|
||||
@ -92,23 +90,15 @@ class BedetequeScraper:
|
||||
# Get author and illustrator
|
||||
author = re.search(regex_author, content)
|
||||
if 'author' not in data:
|
||||
data['author'] = list()
|
||||
data['authors'] = list()
|
||||
if author:
|
||||
data['author'].append(author.group(1))
|
||||
author_obj = Auteur.objects.get_or_create(
|
||||
name=author.group(1))[0]
|
||||
data['authors'].append(author_obj)
|
||||
illustrator = re.search(regex_illustrator, content)
|
||||
if illustrator:
|
||||
data['author'].append(illustrator.group(1))
|
||||
|
||||
author_name = data['author'][0]
|
||||
if ',' not in author_name and ' ' in author_name:
|
||||
author_name = author_name.split(' ')[1]
|
||||
side_identifier = "{:.3} {:.3}".format(author_name.upper(),
|
||||
data['title'].upper(),)
|
||||
if data['subtitle']:
|
||||
start = data['subtitle'].split(' ')[0].replace('.', '')
|
||||
print("start:", start)
|
||||
if start.isnumeric():
|
||||
side_identifier += " {:0>2}".format(start,)
|
||||
data['side_identifier'] = side_identifier
|
||||
author_obj = Auteur.objects.get_or_create(
|
||||
name=illustrator.group(1))[0]
|
||||
data['authors'].append(author_obj)
|
||||
|
||||
return data
|
||||
|
Reference in New Issue
Block a user