Wikiproyecto:Bots/Repositorio/artículos-redirecciones.py

File information
  • File name: artículos-redirecciones.py
  • Language: Python
  • Status: unprotected
Edit details
  • Details:
Script for BOTijo (talk · contribs · blocks) to create accent-free redirects (based on other articles). Originally tarea034.py.
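An illustrative invocation (the flags are those defined by the script's argument parser below; the range A to B and the choice of --api are hypothetical):

    python artículos-redirecciones.py --lang es --begin A --end B --api --test

With --test the bot only prints the redirects it would create; running with --edit instead (and without --test) actually saves them.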
# -*- coding: utf-8 -*-

# Copyright (C) 2009 emijrp
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Create redirects without diacritics pointing to articles (or redirects) whose titles contain diacritics

import argparse, codecs, re, time
from datetime import datetime

import os, sys
sys.path.append(os.path.split(os.getcwd())[0])

from wikipedia import Page, Site, output as display, stopme
import pagegenerators as pg, query as api

pairs={
    u"àáâäãăǎąåā": "a", u'æǣ': "ae",
    u'ḃɓ': "b",
    u'çćčćĉċ': "c",
    u'đḍďḋð': "d",
    u'èéêëẽēę': "e",
    u'ḟƒ': "f",
    u'ĝġģğ': "g",
    u'ĥħ': "h",
    u'ìíîïīį': "i", u'ĳ': "ij",
    u'ĵ': "j",
    u'ķ': "k",
    u'ŀļḷḹľł': "l",
    u'ñńň': "n",
    u'òóôöõøōǫ': "o",
    u'œ': "oe",
    u'ṗ': "p",
    u'ŗřṛṝ': "r",
    u'şṡšŝ': "s", u'ß': "sz",
    u'ţṫṭ': "t",
    u'þ': "th",
    u'ùúûüŭūų': "u",
    u'ẁŵẅƿ': "w",
    u'ýỳŷÿȳỹ': "y",
    u'źžż': "z"
}
diacritics = "".join(pairs.keys())
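# Explanatory note (not in the original script): each key of `pairs` is a
# string of diacritic variants that share one plain replacement, so
# u'é'.lower() is found inside the key u'èéêëẽēę' and becomes "e", while
# `diacritics` concatenates all keys for a quick membership test.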

def simplify_chars(string):
    # Replace every diacritic character with its plain equivalent,
    # preserving the original capitalization.
    word = ""
    for ch in unicode(string):
        is_upper = ch != ch.lower()
        if ch.lower() in diacritics:
            for key in pairs:
                if ch.lower() in key:
                    ch = pairs[key]
                    break
        if is_upper:
            ch = ch.upper()
        word += ch
    word = word.replace(u"l·l", "ll")  # Catalan ela geminada
    #word = re.sub("\W","!", word)
    return word
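# Illustrative examples (hypothetical titles, not from the original script):
#   simplify_chars(u"José Martí") -> u"Jose Marti"
#   simplify_chars(u"Łódź")       -> u"Lodz"
#   simplify_chars(u"col·legi")   -> u"collegi"   (via the l·l replacement)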

def timedelta(td):
    # Take a start timestamp and return the elapsed time as a formatted
    # string, plus the days, hours, minutes and seconds as numbers.
    td = datetime.now() - datetime.fromtimestamp(td)
    hours, remainder = divmod(td.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    result = "%s%s%s%s" % (
        "%i d" % td.days if td.days else "",
        " %i h" % hours if hours else "",
        " %i m" % minutes if minutes else "",
        " %i s" % seconds if seconds else "",
    )
    if not result:
        result = "0 s %s ms" % str(td.microseconds).rstrip("0")
    return result.strip(), td.days, hours, minutes, seconds
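# Illustrative: for a start timestamp 3725 seconds in the past this returns
# ("1 h 2 m 5 s", 0, 1, 2, 5).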

def get_filename(filename="wikipage"):
    # Build the path of the temporary log file, deriving the toolserver
    # username from the script location (/home/<user>/...).
    user = sys.path[0].split("/")[2]
    if not args.path:
        path = "/home/%(u)s/temp/" % {"u": user}
    else:
        path = args.path
    if path.startswith("*"):
        # A leading "*/" is shorthand for the current working directory.
        path = path.replace("*/", "%s/" % os.getcwd())
    if not path.endswith("/"):
        path = "%s/" % path
    return "%(p)s%(l)s%(f)s.log" % {"l": args.lang, "p": path, "f": filename}

def get_sql(query, filename="wikipage"):
    # Run a raw SQL query against the toolserver replica, dump the output to
    # the temporary log file and return its lines.
    fdata = {"l": args.lang, "q": query, "f": get_filename(filename)}
    os.system(
        """mysql -h %(l)swiki-p.db.toolserver.org -e"""
        """ "use %(l)swiki_p;%(q)s" """
        """> %(f)s""" % fdata
    )
    f = codecs.open(get_filename(filename), 'r', encoding="utf-8")
    lines = f.readlines()
    f.close()
    return lines
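# Illustrative (hypothetical query): get_sql("SELECT page_title FROM page LIMIT 5")
# writes the mysql output, header line included, to the log file above and
# returns its lines.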

def load_from_cache():
    f = codecs.open(get_filename(), 'r', encoding="utf-8")
    lines = f.readlines()
    f.close()

    debug('Loading pages from %swiki' % args.lang)
    pages = set()
    for line in lines[1:]:
        # Skip the first line, which is the SQL column header.
        pages.add(line[:-1].strip().replace("_", " "))
    debug(
        'Loaded %i pages out of %i [from %swiki]' % (
            len(pages), len(lines)-1, args.lang
        )
    )
    return pages

def load_from_toolserver():
    # This function only works on the toolserver; elsewhere use
    # load_from_pywikilib or load_using_API instead.
    sql = (
        u"""mysql -h %(l)swiki-p.db.toolserver.org -e """
        u""" "USE %(l)swiki_p;SELECT page_title FROM page WHERE page_title>='%(s)s' """
        u"""AND page_title<'%(t)s' AND page_namespace=0" """
        u"""> %(f)s""" % {
            "l": args.lang,
            "s": unicode(args.begin),
            "t": unicode(args.end),
            "f": get_filename()
        }
    )
    os.system(sql.encode("utf8"))
    debug(sql)
    f = codecs.open(get_filename(), 'r', encoding="utf-8")
    lines = f.readlines()
    f.close()

    debug('Loading pages from %swiki' % args.lang)
    pages = set()
    for line in lines[1:]:
        # Skip the first line, which is the SQL column header.
        pages.add(line[:-1].strip().replace("_", " "))
    debug(
        'Loaded %i pages out of %i [from %swiki]' % (
            len(pages), len(lines)-1, args.lang
        )
    )

    return pages
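# Illustrative: with --lang es --begin A --end B, the generated shell command
# queries eswiki_p for main-namespace titles in the range [A, B) and dumps
# them into the file returned by get_filename().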

def load_from_pywikilib():
    gen = pg.AllpagesPageGenerator(
        start=args.begin, includeredirects=False, site=Site(args.lang,"wikipedia")
    )
    pages = set()

    debug('Loading pages from %swiki' % args.lang)

    for page in gen:
        if page.title() == args.end: break
        pages.add(page.title())
    debug('Loaded %i pages [from %swiki]' % (len(pages), args.lang))
    return pages

def load_using_API():
    pages = set()

    debug('Loading pages from %swiki' % args.lang)
    params = {
        "action": "query",
        "list": "allpages",
        "apfrom": args.begin,
        "apto": args.end,
        "apnamespace": 0,
        "apfilterredir": "nonredirects",
        "aplimit": "max"
    }
    next = True
    while next:
        data = api.GetData(params, Site(args.lang, "wikipedia"))
        next = data.has_key("query-continue") and data['query-continue']['allpages'].has_key('apcontinue')
        for page in data['query']['allpages']:
            pages.add(page['title'])
        if next:
            params['apcontinue'] = data['query-continue']['allpages']['apcontinue']
    debug('Loaded %i pages [from %swiki]' % (len(pages), args.lang))
    return pages
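# Note: the loop above follows the (old) MediaWiki query-continue protocol;
# while the response includes data['query-continue']['allpages']['apcontinue'],
# that value is sent back as params['apcontinue'] to fetch the next batch.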

def filter_pages(titles):
    filter = set()
    e = 0
    for title in titles:
        # Only consider titles made up entirely of basic latin letters,
        # known diacritics, digits and common punctuation.
        if re.search(ur"^[a-z%s0-9\-.,: ]+$" % diacritics, title, re.I):
            ntitle = simplify_chars(title)
            if title != ntitle and ntitle not in titles:
                filter.add(ntitle)

                if len(filter) % 100 == 0:
                    debug(str(len(filter)))
                    #debug(ur"[[%s]] -> [[%s]]" % (page2, page))

                page = Page(Site(args.lang, 'wikipedia'), title)
                npage = Page(Site(args.lang, 'wikipedia'), ntitle)
                if not npage.exists():
                    if page.isRedirectPage():
                        # Point the new redirect straight at the final target.
                        output = u"#REDIRECT [[%s]]" % page.getRedirectTarget().title()
                    else:
                        output = u"#REDIRECT [[%s]]" % title
                    debug(output)
                    if args.edit and not args.test:
                        e += 1
                        npage.put(output, u"BOT - %s" % output)
    debug("Made %i edits out of %i available, over %i loaded pages." % (e, len(filter), len(titles)))

def debug(string):
    if args.test or not args.quiet: display(string)

def main():
    t = time.time()
    debug(u"[\3{lightyellow}%s\3{default}] Starting." % time.strftime("%H:%M:%S"))
    try:
        if args.cache:
            if os.path.exists(get_filename()):
                pages = load_from_cache()
            else:
                debug("The temporary file does not exist, starting the SQL query...")
                pages = load_from_toolserver()
        elif args.piwikimedia:
            pages = load_from_pywikilib()
        elif args.use_api:
            pages = load_using_API()
        else:
            pages = load_from_toolserver()
    except KeyboardInterrupt:
        debug("Cancelled by user...")
        return  # 'pages' was never loaded, so there is nothing left to do.
    debug(u"[\3{lightpurple}%s\3{default}] OK. Elapsed time: %s." % (time.strftime("%H:%M:%S"), timedelta(t)[0]))
    filter_pages(pages)
    if args.remove:
        os.system("rm %s" % get_filename())

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Creates accent-free redirects for articles whose titles contain diacritics.",
        usage="%(prog)s [--lang <lang>] [--begin <A>] [--end <M>] [--path </home/emijrp/temporal/>] [--api|--cache|--pgen] [--remove]"
    )
    parser.add_argument("--lang", "-l", default="es", help="Project language. (Optional, default: '%(default)s'.)", metavar="es")
    parser.add_argument("--begin", "-b", default="!", type=unicode, help="First article", metavar="!")
    parser.add_argument("--end", "-e", default=u"ÿ", type=unicode, help="Last article", metavar="ÿ")
    parser.add_argument("--pgen", "-g", dest="piwikimedia", action="store_true", default=False, help="use the pagegenerator method; not recommended, it is the slowest and the most resource-hungry.")
    parser.add_argument("--api", "-a", dest="use_api", action="store_true", default=False, help="use the API; recommended when no toolserver access is available.")
    parser.add_argument("--cache", "-C", action="store_true", default=False, help="use the cache (temporary files, toolserver only)")
    parser.add_argument("--edit", "-E", action="store_true", default=False, help="edit; required for the bot to actually make changes")
    parser.add_argument("--remove", "-R", action="store_true", default=False, help="remove temporary files (toolserver only)")
    parser.add_argument("--path", "-H", default=None, help="file path (toolserver only; default: /home/{USER}/temp/)", metavar="/home/{USER}/temp/")
    parser.add_argument("--quiet", "-Q", action="store_true", default=False, help="suppress extra information while the program runs.")
    parser.add_argument("--test", "-T", action="store_true", default=False, help="enable test mode (disables editing and shows all extra information).")
    args = parser.parse_args()
    try:
        main()
    except KeyboardInterrupt:
        display("Cancelled by user...")
    finally:
        stopme()