Szerkesztő:DhanakBot/szulhal.py

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
Ez a bot a magyar wikipédia életrajzi lapjainak születési és halálozási
dátumait jegyzi be a megfelelő év- és nap-lapokra.

A következő argumentumokat ismeri:

    -cat:kategória     az adott kategória szócikkeit járja be

    -catr:kategória    az adott kategória és alkategóriái szócikkeit járja be

    -links:lapnév      minden lapot bejár, ami a lapnév lapról hivatkozva van

    -file:textfájl     a textfájlban hivatkozott lapokat járja be
                       (csak a [[lapnév]] formájú hivatkozásokat tekinti)

    -load[:pddfájl]    egy korábban elmentett pdd fájlból folytatja a munkát
                       (alapértelmezés: szulhal.pdd)

    -ro                csak olvasás mód teszteléshez, nem módosít a lapokon

    -noworklog         nem tölti fel a munkanaplót a munka végeztével

Minden más paraméter egy lap címeként értelmeződik; ilyenkor csak ezt az
egy életrajzi lapot dolgozza fel a bot.
"""

__version__='0.3'

import sys, re, time, codecs
import wikipedia, date, pagegenerators, catlib
import szulhalGUI

# Month-name converter for the site's language, taken from the framework's
# date.formats table; dayCnvt() calls it with a lower-cased month name.
monthIdx = date.formats['MonthName'][wikipedia.getSite().language()]

# A year: three or four digits, captured as one group.
yearre  = re.compile("([0-9]{3,4})")
# Non-capturing alternation of whatever date.makeMonthNamedList() returns for
# the site language (presumably the month names - confirm against date.py).
monthre = re.compile("(?:" + '|'.join(date.makeMonthNamedList(wikipedia.getSite().language(), '%s')) + ")",
                     re.IGNORECASE)
# A calendar day: group 1 is the month alternative, group 2 the one- or
# two-digit day number.
dayre   = re.compile("(" + monthre.pattern + ") +([0-9]{1,2})", re.IGNORECASE)

# A date as written in an article lead: the year (group 1), optionally followed
# by "]]", ".", spaces and "[[", then an optional "month day" part (group 2).
datere  = re.compile(yearre.pattern + "(?:\]\])?\.? *(?:\[\[)?(" + monthre.pattern + " [0-9]{1,2})?",
                     re.IGNORECASE)
# A biography head line: group 1 is the content of the parentheses (searched
# for dates by SzulHalBot.run), group 2 is the text after them up to the first
# ";", end of line, or "." that is not followed by " század".
headre  = re.compile(u"^.*?\((.*)\) *[.:;,-]? *(.*?)(?:\.(?! *század)|;|$)", re.IGNORECASE)

# Work log of the current run; opened with "w+" so it is truncated on start and
# can be read back by SzulHalBot.run when the log is uploaded.
log = codecs.open("szulhal.log", "w+", "utf-8")

class SzulHalBot:
    """
    Bot that copies the birth and death dates found in biography articles
    onto the corresponding year and day pages.

    generator   -- iterable of page objects to examine
    headerLines -- how many leading lines of each article are searched for
                   the head pattern
    savedState  -- passed unchanged to the review GUI (name of a saved state
                   file, or None)
    readOnly    -- when true, diffs are shown but nothing is saved
    noLog       -- when true, the work log is not uploaded at the end
    """

    def __init__(self, generator, headerLines = 10, savedState = None,
                 readOnly = False, noLog = False):
        self.generator = generator
        self.headerLines = headerLines
        self.savedState = savedState
        self.readOnly = readOnly
        self.noLog = noLog
        # Set once at least one date page has been saved successfully.
        self.modifications = False


    def run(self):
        """
        Collect the dates from every page of the generator, let the user
        review them in the GUI, update the date pages and finally upload the
        work log (unless disabled or nothing was modified).
        """
        data = []

        log.write("= %s =\n" % time.asctime(time.gmtime(time.time())))
        log.write(u"== Nem felismert életrajzi lapok ==\n")

        for page in self.generator:
            try:
                text = page.get()
            except wikipedia.NoPage:
                wikipedia.output(u'HIBA: Nem találom a %s lapot' % page.title())
                continue
            except wikipedia.IsRedirectPage:
                continue

            for line in text.splitlines()[0:self.headerLines]:
                hdmatch = headre.match(line)
                if not hdmatch:
                    continue
                # The (None, None) sentinel stands in for a missing death
                # date, so a single date in the parentheses is enough.
                dates = datere.findall(hdmatch.group(1)) + [(None, None)]
                if len(dates) >= 2:
                    person = [page.title()] + list(dates[0]) + list(dates[1]) + [hdmatch.group(2)]
                    data.append((line, person))
                    break
            else:
                # No line in the examined range matched the head pattern.
                log.write("* [[%s]]\n" % page.title())
                wikipedia.output(u"%s első %d sorában nem találtam a mintára illeszkedő fejet!" %
                                 (page.title(), self.headerLines))

        # The GUI result is used as a pair: index 0 holds the persons to
        # process, index 1 the persons to list as unrecognised; a false
        # value aborts the run.
        data = szulhalGUI.SzulHalGUI(data, self.savedState).display()
        if not data:
            return

        for person in data[1]:
            log.write("* [[%s]]\n" % person[0])
        data = data[0]

        log.write(u"== Sikeresen felismert életrajzi lapok ==\n")

        for person in data:
            self.checkPerson(*person)

        if not self.noLog and self.modifications:
            # Prepend this run's log to the on-wiki work log page.
            log.seek(0)
            page = wikipedia.Page(wikipedia.getSite(), u"User:DhanakBot/Napló")
            try:
                naplo = page.get()
                page.put(log.read() + naplo, u"Robot: Elvégzett munka naplózása")
            except (wikipedia.NoPage, wikipedia.IsRedirectPage):
                wikipedia.output(u"A naplót nem tudtam frissíteni")


    def checkPerson(self, person, birthyear, birthday, deathyear, deathday, desc):
        """
        Update all data of one person on the (at most) four date pages.

        The birth pages are handled only when both the birth year and the
        birth day are known; likewise for the death pages. Pages are saved
        only when the bot is not in read-only mode.
        """
        log.write("* [[%s]]\n" % person)
        pages = []
        if birthyear and birthday:
            # Year page entries are ordered by day, day page entries by year.
            pages += [
                self.checkDatePage(birthyear, u"Robot: %s születése" % person, u"Születések",
                                   person, dayCmp, birthday, u'†', deathyear, desc),
                self.checkDatePage(birthday, u"Robot: %s születése" % person, u"Születések",
                                   person, yearCmp, birthyear, u'†', deathyear, desc)
                ]

        if deathyear and deathday:
            pages += [
                self.checkDatePage(deathyear, u"Robot: %s halála" % person, u"Halálozások",
                                   person, dayCmp, deathday, u'*', birthyear, desc),
                self.checkDatePage(deathday, u"Robot: %s halála" % person, u"Halálozások",
                                   person, yearCmp, deathyear, u'*', birthyear, desc)
                ]

        if not self.readOnly:
            # checkDatePage returns None for pages that need no change.
            for (page, newText, comment) in filter(None, pages):
                try:
                    page.put(newText, comment)
                    self.modifications = True
                except wikipedia.EditConflict as arg:
                    log.write(u"** HIBA: %s\n" % arg.args[0])
                    wikipedia.output("HIBA: %s" % arg.args[0])


    def checkDatePage(self, pageTitle, modCmt, *changeArgs):
        """
        Compute the updated text of the pageTitle date page for one person.

        changeArgs is forwarded to changeSection (section title followed by
        the InsertBirthDeath arguments). Returns a (page, newText, modCmt)
        tuple when the page text changes, otherwise None; problems are
        logged instead of being raised.
        """
        log.write("** [[%s]] " % pageTitle)
        try:
            page = wikipedia.Page(wikipedia.getSite(), pageTitle)
            text = page.get()
            newText = self.changeSection(text, *changeArgs)
            wikipedia.output("\n>>>>> %s <<<<<\n" % pageTitle)
            wikipedia.showDiff(text, newText)
            if text != newText:
                log.write(u"módosítva\n")
                return (page, newText, modCmt)
            log.write(u"módosítása nem szükséges\n")
        except SzulHalExn as arg:
            wikipedia.output(arg.args[0])
            log.write(arg.args[0] + "\n")
        except wikipedia.NoPage:
            log.write(u"nem létezik\n")
            wikipedia.output(u'Nem találom a %s lapot' % page.title())
        except wikipedia.IsRedirectPage:
            log.write(u"redirekt\n")
        return None

    def changeSection(self, text, section, *changeArgs):
        """
        Return text with the body of the given section rewritten by an
        InsertBirthDeath(*changeArgs) callback.

        The section body ends at the next "==", "{{", "[[en:" or
        "[[Kategória:". Raises SzulHalExn when no such section is found.
        """
        # The title is escaped so that it is always matched literally, even
        # if it contains regex metacharacters.
        sectionre = re.compile("(== *" + re.escape(section) +
                               u" *==\n)(.*?)(?===|{{|\[\[en:|\[\[Kategória:)", re.DOTALL|re.IGNORECASE)
        (newText, subCnt) = sectionre.subn(InsertBirthDeath(*changeArgs), text, 1)
        if subCnt == 0:
            raise SzulHalExn(u"HIBA: nem találom a %s szakaszt" % section)
        return newText


class SzulHalExn(Exception):
    """Error raised while updating a date page; the message is args[0]."""
    pass

class InvalidLine(Exception):
    """Raised for a line of unknown structure; the line is args[0]."""
    pass

class InsertBirthDeath:
    """
    Replacement callback (for re.sub/subn) that puts one person into a
    births or deaths section of a date page.

    personPage  -- title of the person's article
    whenCmp     -- cmp-style comparator used to order the section lines
    when        -- the date shown in front of the entry (a year or a
                   "month day" string)
    otherChar   -- marker placed before the other year in the suffix
    otherYear   -- the person's other year (birth year on death lists and
                   vice versa); a false value omits the suffix
    description -- short description appended after the link; may be empty
    """

    def __init__(self, personPage, whenCmp, when, otherChar, otherYear, description):
        self.personPage = personPage
        # Display name: the title up to the first " (" (qualifier dropped).
        self.person = personPage.split(" (")[0]
        self.whenCmp = whenCmp
        self.when = when
        self.otherChar = otherChar
        self.otherYear = otherYear
        self.description = description


    def __call__(self, sectionMatch):
        """
        Return the replacement for a matched section: the heading (group 1)
        followed by the tidied, completed and sorted entry lines (group 2).

        Raises SzulHalExn when a line cannot be sorted because the
        comparator does not recognise its structure.
        """
        sectionLines = [self.tidy(line)
                        for line in sectionMatch.group(2).splitlines()]
        if self.otherYear:
            otherText = " (" + self.otherChar + " [[" + self.otherYear + "]])"
        else:
            otherText = ""

        # The name and the year are looked up as literal substrings; using
        # them as regular expressions would misbehave for titles containing
        # characters such as "." "+" "?" or "[".
        for i, line in enumerate(sectionLines):
            if self.person in line:
                if otherText and self.otherYear not in line:
                    sectionLines[i] = line + otherText
                break
        else:
            sectionLines.append(self._newEntry(otherText))

        try:
            # cmp-style sort, as supported by Python 2's list.sort.
            sectionLines.sort(self.whenCmp)
        except InvalidLine as arg:
            raise SzulHalExn(u"HIBA: ismeretlen szerkezetű sor: '%s'" % arg.args[0])

        return sectionMatch.group(1) + '\n'.join(sectionLines) + "\n"


    def _newEntry(self, otherText):
        """Build a fresh list entry for the person, ending with otherText."""
        line = "* [[" + self.when + "]]"
        if not yearre.match(self.when):
            # Anything that does not start with a year is followed by a dot.
            line += "."
        line += u" – [[" + self.personPage
        if self.person != self.personPage:
            line += "|" + self.person
        line += "]]"
        if self.description:
            line += " " + self.description
        return line + otherText


    def tidy(self, line):
        """
        Normalise one existing entry line to the canonical
        "* [[date]] – text (marker [[year]])" form.

        Empty lines are returned as they are; lines that tidyre does not
        match are reported and returned stripped but otherwise unchanged.
        """
        line = line.strip()
        if line == "":
            return line

        match = tidyre.match(line)
        if not match:
            wikipedia.output(u"FIGYELEM: Ennek a sornak nem ismertem fel a szerkezetét: '%s'" % line)
            return line

        line = "* [[" + match.group(1) + "]]"
        if not yearre.search(match.group(1)):
            line += "."
        line += u" – " + match.group(5) # groups 2, 3 and 4 are in yearre and dayre

        marker = match.group(6)
        if marker and match.group(7): # otherchar and otheryear
            line += " ("
            # tidyre matches case-insensitively, so the "sz..." word marker
            # must be recognised regardless of its capitalisation.
            if marker == '*' or marker.lower().startswith("sz"):
                line += "*"
            else:
                line += u"†"
            line += " [[" + match.group(7) + "]])"

        return line
        

# Recognises one existing list entry of a date page. Capturing groups:
#   1 - the date (a year or a "month day"), 2 - the year alternative,
#   3 - month name, 4 - day number, 5 - the entry text,
#   6 - the marker in the trailing parentheses (*, †, + or a word starting
#       with "sz" or "m"), 7 - the year following that marker.
tidyre = re.compile("^\* *(?:\[\[)?" +
                    "(" + yearre.pattern + "|" + dayre.pattern + ")" +
                    u"(?:\]\])?(?: *\.)? *[-–] *(.*?) *[.;]? *" +
                    u"(?: *\( *([*†+]|sz\S*?|m\S*?|†)(?: *| )(?:\[\[)?" +
                    yearre.pattern + "(?:\]\])? *\) *[.;]? *)?$", re.IGNORECASE)

class DateCmp:
    """
    cmp-style comparator that orders lines by the date found in them.

    regex   -- compiled pattern searched in each line
    convert -- called with the groups of the match; returns the sort key
    """

    def __init__(self, regex, convert):
        self.regex = regex
        self.convert = convert

    def __call__(self, str1, str2):
        """
        Compare two lines; empty lines sort after every non-empty line.
        Raises InvalidLine with the offending line when the pattern is not
        found in it.
        """
        if "" in (str1, str2):
            # True is larger than False, so an empty line goes last.
            return cmp(str1 == "", str2 == "")

        found1 = self.regex.search(str1)
        found2 = self.regex.search(str2)
        for text, found in ((str1, found1), (str2, found2)):
            if not found:
                raise InvalidLine(text)

        key1 = self.convert(*found1.groups())
        key2 = self.convert(*found2.groups())
        return cmp(key1, key2)

def dayCnvt(monthstr, daystr):
    """
    Turn the month-name and day strings captured by dayre into a
    (month, day) tuple usable as a sort key. The month part is whatever
    monthIdx returns for the lower-cased month name.
    """
    month = monthIdx(monthstr.lower())
    day = int(daystr)
    return (month, day)

# Orders lines by the first year found in them; SzulHalBot.checkPerson uses it
# for the entries of day pages.
yearCmp = DateCmp(yearre, int)
# Orders lines by the first "month day" found in them; SzulHalBot.checkPerson
# uses it for the entries of year pages.
dayCmp  = DateCmp(dayre, dayCnvt)


def main():
    """
    Interpret the command-line arguments, build the page generator and run
    the bot. Arguments that are not recognised options are joined with
    spaces into a single page title, which then replaces any generator
    selected by an option. Without a generator and without -load the help
    text is shown instead.
    """
    pages = None
    stateFile = None
    dryRun = False
    skipWorklog = False
    titleWords = []

    for arg in wikipedia.handleArgs():
        if arg == '-ro':
            dryRun = True
        elif arg == '-noworklog':
            skipWorklog = True
        elif arg.startswith('-load'):
            # Skip "-load" and the ":" separator; default file if none given.
            stateFile = arg[6:] or 'szulhal.pdd'
            pages = iter([])
        elif arg.startswith('-file:'):
            pages = pagegenerators.TextfilePageGenerator(arg[6:])
        elif arg.startswith('-links:'):
            source = wikipedia.Page(wikipedia.getSite(), arg[7:])
            pages = pagegenerators.LinkedPageGenerator(source)
        elif arg.startswith('-cat:'):
            category = catlib.Category(wikipedia.getSite(), u"Kategória:" + arg[5:])
            pages = pagegenerators.CategorizedPageGenerator(category)
        elif arg.startswith('-catr:'):
            category = catlib.Category(wikipedia.getSite(), u"Kategória:" + arg[6:])
            pages = pagegenerators.CategorizedPageGenerator(category, recurse=True)
        else:
            titleWords.append(arg)

    if titleWords:
        single = wikipedia.Page(wikipedia.getSite(), ' '.join(titleWords))
        pages = iter([single])

    if not pages and not stateFile:
        wikipedia.showHelp('szulhal')
        return

    preloaded = pagegenerators.PreloadingGenerator(pages, pageNumber = 20)
    bot = SzulHalBot(preloaded, savedState = stateFile,
                     readOnly = dryRun, noLog = skipWorklog)
    bot.run()


# Script entry point: run the bot only when the file is executed directly.
if __name__ == "__main__":
    try:
        main()
        
    finally:
        # Always call the framework's stopme(), even if main() raised.
        wikipedia.stopme()