mirror of
https://gitlab.crans.org/mediatek/med.git
synced 2025-07-07 07:04:00 +02:00
Add script to regenerate side identifiers
This commit is contained in:
109
media/forms.py
109
media/forms.py
@ -1,5 +1,5 @@
|
||||
# -*- mode: python; coding: utf-8 -*-
|
||||
# Copyright (C) 2017-2019 by BDE ENS Paris-Saclay
|
||||
# Copyright (C) 2017-2020 by BDE ENS Paris-Saclay
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
import json
|
||||
@ -7,6 +7,7 @@ import re
|
||||
import unicodedata
|
||||
import urllib.request
|
||||
|
||||
from django.db.models import QuerySet
|
||||
from django.forms import ModelForm
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
@ -14,6 +15,60 @@ from .models import Auteur, BD
|
||||
from .scraper import BedetequeScraper
|
||||
|
||||
|
||||
# Leading words dropped from a normalized title.  They are tried once each,
# in this order, so "DE LA TERRE" loses both "DE " and "LA ".
_LEADING_ARTICLES = (r'^DE ', r'^LE ', r'^LA ', r'^LES ', r"^L'", r'^THE ')


def _fold_to_upper(text, keep=' '):
    """
    Return ``text`` upper-cased with accents and noise characters removed.

    The text is case-folded and NFKD-decomposed, then every character whose
    Unicode category is a mark (M), punctuation (P), separator (Z) or
    control/other (C) is dropped, unless it appears in ``keep``.
    """
    decomposed = unicodedata.normalize('NFKD', text.casefold())
    return ''.join(
        char for char in decomposed
        if char in keep or unicodedata.category(char)[0] not in 'MPZC'
    ).casefold().upper()


def _author_sort_key(author):
    """
    Order authors by descending note, then last word of name, then pk.

    A tuple is used so that the note is compared numerically: comparing
    ``str(-note)`` would rank a note of 1 ahead of a note of 20.
    """
    return -author.note, author.name.split(" ")[-1], author.pk


def generate_side_identifier(title, authors, subtitle=None):
    """
    Build the side identifier of a medium: ``AUT TIT`` or ``AUT TIT NN``.

    * ``AUT``: first three letters of the primary author's name (the last
      word of the name, unless the name contains a comma).  The primary
      author is the first one according to ``_author_sort_key``.
    * ``TIT``: first three characters of the title once upper-cased,
      stripped of accents, punctuation and leading articles.
    * ``NN``: the leading number of ``subtitle`` padded to two digits,
      appended only when the subtitle starts with a number.

    :param title: title of the medium.
    :param authors: iterable of objects exposing ``name``, ``note`` and
        ``pk`` (a list or a QuerySet for instance).  It is not modified.
    :param subtitle: optional subtitle, possibly wrapped in ``<span>`` tags.
    :return: the side identifier, in upper case.
    :raises IndexError: if ``authors`` is empty.
    """
    # list() both evaluates a QuerySet and copies a list, so the caller's
    # sequence is never reordered by the sort below.
    authors = list(authors)

    # Typographic apostrophes become ASCII ones so that "L’" is recognised.
    title_normalized = title.upper().replace('’', '\'')
    # Apostrophes are kept at this stage: dropping them together with the
    # rest of the punctuation would prevent the "L'" article from matching.
    title_normalized = _fold_to_upper(title_normalized, keep=" '")
    for article in _LEADING_ARTICLES:
        title_normalized = re.sub(article, '', title_normalized)
    title_normalized = title_normalized.replace('Œ', 'OE')
    # Removes spaces, remaining apostrophes and anything else unexpected.
    title_normalized = re.sub("[^A-Z0-9$]", "", title_normalized)

    authors.sort(key=_author_sort_key)
    primary_author = authors[0]
    author_name = primary_author.name.upper()
    if ',' not in author_name and ' ' in author_name:
        author_name = author_name.split(' ')[-1]
    author_name = _fold_to_upper(author_name)
    author_name = re.sub("[^A-Z]", "", author_name)

    side_identifier = "{:.3} {:.3}".format(author_name, title_normalized)

    if subtitle:
        subtitle = re.sub(r'</span>', '', subtitle)
        # "[^>]*" instead of ".*": only the opening tag itself is removed,
        # even when another ">" appears later in the subtitle.
        subtitle = re.sub(r'<span[^>]*>', '', subtitle)
        start = subtitle.split(' ')[0].replace('.', '')

        if start.isnumeric():
            side_identifier += " {:0>2}".format(start)

    # Normalize side identifier, in order to remove accents
    return _fold_to_upper(side_identifier)
|
||||
|
||||
|
||||
class MediaAdminForm(ModelForm):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@ -188,54 +243,12 @@ class MediaAdminForm(ModelForm):
|
||||
)
|
||||
|
||||
if self.cleaned_data['authors']:
|
||||
authors = self.cleaned_data['authors']
|
||||
old_authors = authors.copy()
|
||||
side_identifier = generate_side_identifier(
|
||||
self.cleaned_data["title"],
|
||||
self.cleaned_data["authors"],
|
||||
self.cleaned_data["subtitle"],
|
||||
)
|
||||
|
||||
def sort(author):
|
||||
return str(-author.note) + "." \
|
||||
+ str(old_authors.index(author)) \
|
||||
+ "." + author.name
|
||||
|
||||
authors.sort(key=sort)
|
||||
author_name = self.cleaned_data['authors'][0].name
|
||||
if ',' not in author_name and ' ' in author_name:
|
||||
author_name = author_name.split(' ')[-1]
|
||||
title_normalized = self.cleaned_data['title'].upper()
|
||||
title_normalized = re.sub(r'^LE ', '', title_normalized)
|
||||
title_normalized = re.sub(r'^LA ', '', title_normalized)
|
||||
title_normalized = re.sub(r'^LES ', '', title_normalized)
|
||||
title_normalized = re.sub(r'^L\'', '', title_normalized)
|
||||
title_normalized = re.sub(r'^THE ', '', title_normalized)
|
||||
title_normalized = re.sub(r'Œ', 'OE', title_normalized)
|
||||
side_identifier = "{:.3} {:.3}".format(
|
||||
author_name,
|
||||
title_normalized.replace(' ', ''), )
|
||||
|
||||
if self.cleaned_data['subtitle']:
|
||||
self.cleaned_data['subtitle'] = re.sub(
|
||||
r'</span>',
|
||||
'',
|
||||
self.cleaned_data['subtitle']
|
||||
)
|
||||
self.cleaned_data['subtitle'] = re.sub(
|
||||
r'<span.*>',
|
||||
'',
|
||||
self.cleaned_data['subtitle']
|
||||
)
|
||||
start = self.cleaned_data['subtitle'].split(' ')[0] \
|
||||
.replace('.', '')
|
||||
|
||||
if start.isnumeric():
|
||||
side_identifier += " {:0>2}".format(start, )
|
||||
|
||||
# Normalize side identifier, in order to remove accents
|
||||
side_identifier = ''.join(
|
||||
char
|
||||
for char in unicodedata.normalize(
|
||||
'NFKD', side_identifier.casefold())
|
||||
if all(not unicodedata.category(char).startswith(cat)
|
||||
for cat in {'M', 'P', 'Z', 'C'}) or char == ' '
|
||||
).casefold().upper()
|
||||
self.cleaned_data['side_identifier'] = side_identifier
|
||||
|
||||
return self.cleaned_data
|
||||
|
Reference in New Issue
Block a user