# Utilisateur:RimBot/interproject.py

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
Script to check interproject links for general pages. This works by downloading the
page, and checking if a page with the same name exists on other wikis (list
specified on command line).

# TODO
#   - Check redirects to the original page
#   - Handle disambig pages on remote projects 

This script understands various command-line arguments:

    -cat           Work on all pages which are in a specific category.
                   Argument can also be given as "-cat:categoryname".

    -ref           Work on all pages that link to a certain page.
                   Argument can also be given as "-ref:referredpagetitle".

    -links         Work on all pages that are linked from a certain page.
                   Argument can also be given as "-links:linkingpagetitle".

    -new           Work on the most recent new pages on the wiki

    -subcat        When the pages to work on have been chosen by -cat, pages in
                   subcategories of the selected category are also included.
                   When -cat has not been selected, this has no effect.
    

    -file:         used as -file:filename, read a list of pages to treat
                   from the named file


    -start:        used as -start:title, specifies that the robot should
                   go alphabetically through all pages on the home wiki,
                   starting at the named page. 

    -select        ask for *every* link whether it should be included or not

    -ask           ask before any change is made to the wiki

    -dry           do not change the wiki, just print what would have been done

    -autonomous    run the script in autonomous mode : ask no question, only
                   check if the current interproject links are valid (and if
                   not, remove them)
     
    -compare:      used as -compare:project, check that for every interproject
                   linking to project, there is a link back and dump a list
                   of pages that miss this link

    -output:       used as -output:pagename, output the result of -compare into this pagename
"""
import wikipedia, pagegenerators, catlib, config
import sys, re, string, difflib

# Edit-summary fragments, keyed by language code.  Each tuple is laid out as
# (prefix, summary when adding, summary when removing, summary when modifying);
# workon() picks the prefix plus one of the three summaries by index.
msg = {
    'en': (
        u'robot: ',
        u'adding interproject links',
        u'removing interproject links',
        u'modifying interproject links',
    ),
    'fr': (
        u'robot : ',
        u'ajoute les liens interprojets',
        u'retire les liens interprojets',
        u'modifie les liens interprojets',
    ),
}

class Global(object):
    """Holder for the run-wide settings and the report text being built.

    The rest of the module reads and writes these through one shared
    instance; no other globals should be introduced."""

    # Behaviour switches, turned on from the command line by Main.parse().
    autonomous = False
    dry = False
    ask = False
    select = False

    # Comparison mode: the project named by -compare and the page named by
    # -output that receives the report.
    outputpage = None
    compare = None

    # Report fragments accumulated by check_backlink(): table rows for pages
    # without a Wikipedia link (noip) and for pages with one (output).
    noip = ""
    output = ""

    # Title of the home wiki's main page, which workon() skips.
    mainpagename = None

    # Project codes searched for same-named pages, and the site objects
    # looked up by project code.
    siblings = []
    sites = {}

def check_backlink(page, links):
    """Record, as wiki-table rows, whether the Wikipedia page links back.

    page  -- the local page being examined.
    links -- dict mapping project codes to lists of remote pages; a
             'wikipedia' entry, when present, is copied over 'w'.

    Pages with neither a 'w' nor a 'wikipedia' entry get a row appended to
    globalvar.noip.  Otherwise the first 'w' page is downloaded and a row is
    appended to globalvar.output telling whether it contains any link to the
    home wiki and whether one of those links targets this very page.
    """
    has_wikipedia_link = ('w' in links) or ('wikipedia' in links)
    if not has_wikipedia_link:
        # Second column is always "no"; the third says whether the page has
        # any interproject link at all.
        row = u'|-\n|[[%s]] || {{non}} || ' % page.title()
        if links:
            row += u'{{oui}}\n'
        else:
            row += u'{{non}}\n'
        globalvar.noip += row
        return

    if 'wikipedia' in links:
        links['w'] = links['wikipedia']

    remote = links['w'][0]
    # Make sure it's a recent version we're getting.
    remote.site().forceLogin()

    home = wikipedia.getSite()

    # Download the remote page through its site object.
    html = remote.site().getUrl(remote.site().get_address(remote.urlname()))

    globalvar.output += u'|-\n|[[%s]]||[[:%s:%s]]' % (page.title(), remote.site().sitename(), remote.title())

    # Anchors pointing at the home wiki; the captured group is whatever
    # follows the home wiki's host name and address prefix.
    anchor_re = re.compile(r'<a href="//%s%s([^"]+)"' % (home.hostname(), home.nice_get_address('')))
    targets = anchor_re.findall(html)

    if not targets:
        globalvar.output += u'|| {{non}} || {{non}}\n'
        return

    globalvar.output += u'|| {{oui}}'

    if page.urlname() in targets:
        globalvar.output += u'|| {{oui}}\n'
    else:
        globalvar.output += u'|| {{non}}\n'


def parse_interproject(title, text):
    """Extract and strip every {{interprojet ...}} template from text.

    title -- page title, only used in the warning emitted when the template
             occurs more than once.
    text  -- wikitext of the page.

    Returns a (projects, text) tuple.  projects is a list of the distinct
    pipe-separated template arguments (e.g. u'w=Foo', u'q', possibly u''),
    in order of first appearance; text is the input with all matched
    templates removed.
    """
    # Raw strings instead of the Python-2-only ur'' prefix; both patterns are
    # pure ASCII so matching is unchanged.
    template_re = re.compile(r'\{\{interprojet(.*?)\}\}',
                             re.IGNORECASE | re.MULTILINE | re.DOTALL)
    # Collapses whitespace (and stray pipes) around '=' and '|' separators.
    # Compiled once rather than on every match.
    separator_re = re.compile(r'[\s|]*([=|])\s*')

    projects = []
    seen = set()
    kept = []      # pieces of text lying between the templates
    cursor = 0
    found_one = False

    for match in template_re.finditer(text):
        if found_one:
            wikipedia.output(u'WARNING: %s includes {{interprojet}} more than once!'
                             % title)
        found_one = True

        # Cleanup : remove unneeded whitespaces, pipes and newlines
        args = match.group(1).replace("\n", "").replace("\r", "")
        args = separator_re.sub('\\1', args)

        # Merge the arguments of all templates, dropping duplicates while
        # keeping a deterministic order.
        for entry in args.split('|'):
            if entry not in seen:
                seen.add(entry)
                projects.append(entry)

        kept.append(text[cursor:match.start()])
        cursor = match.end()

    kept.append(text[cursor:])
    return (projects, ''.join(kept))

def check_interprojects(title, projects, links):
    """Resolve the template arguments in projects to remote pages.

    title    -- title of the local page; used as the remote page name when an
                argument gives none, and in messages.
    projects -- template arguments as produced by parse_interproject().
    links    -- dict (project code -> list of pages) that is filled in place.

    Arguments naming a page that does not exist are dropped silently in
    compare/autonomous mode, and otherwise only when the user agrees.
    Returns the list of flags to re-emit, which is currently always empty.
    """
    flags = []
    home = wikipedia.getSite()

    for entry in projects:
        if not entry:
            continue

        # Remove old, deprecated tags.
        if entry.startswith('nolink') or entry.startswith('etiq'):
            continue

        # "code=page" carries an explicit page name, a bare "code" means the
        # remote page has the same title as the local one.
        pieces = entry.split('=', 1)
        project = pieces[0]
        if len(pieces) == 2:
            val = pieces[1]
        else:
            val = title

        # "|code=" is ignored by the template, so ignore it (and remove it)
        if not val:
            continue

        # "code-lang" selects another language edition of the project.
        pieces = project.split('-', 1)
        pproject = pieces[0]
        if len(pieces) == 2:
            lang = pieces[1]
        else:
            lang = home.lang

        known = home.family.known_families
        if pproject not in known:
            wikipedia.output(u'WARNING: %s has interproject link to unknown %s project'
                             % (title, project))
            continue

        family = known[pproject]
        if project not in globalvar.sites:
            # For these two families the family name is used as the code.
            if family in ['meta', 'commons']:
                lang = family

            try:
                globalvar.sites[project] = wikipedia.getSite(fam = family, code = lang)
            except ValueError:
                wikipedia.output(u'WARNING: %s has interproject link to known, but unimplemented family : %s'
                                 % (title, project))
                continue

        rpage = wikipedia.Page(globalvar.sites[project], val)
        if not rpage.exists():
            if globalvar.compare or globalvar.autonomous:
                continue
            answer = wikipedia.inputChoice(u'WARNING: %s has link to nonexisting page [[%s:%s]]. Remove it?'
                                           % (title, project, val),
                                           ['Yes', 'No'], ['y', 'n'])
            if answer == 'y':
                continue

        if rpage.isRedirectPage():
            rpage = rpage.getRedirectTarget()
            # The redirect target may already have been collected.
            if rpage in links.get(project, []):
                continue

        wikipedia.output(u'Adding new interproject to %s (%s)'
                         % (project, val) )
        links.setdefault(project, []).append(rpage)

    return flags

def check_projects(title, links):
    """Look on every sibling project for a page called title.

    title -- title of the local page.
    links -- dict (project code -> list of pages), extended in place with
             each existing page found that is not listed yet.
    """
    # FIXME: (or with names redirecting on this one on local wikis)
    for code in globalvar.siblings:
        candidate = wikipedia.Page(globalvar.sites[code], title)
        if candidate.isRedirectPage():
            candidate = candidate.getRedirectTarget()

        if candidate in links.get(code, []):
            wikipedia.output(u'%s already in %s links' % (title, code))
            continue

        if not candidate.exists():
            continue

        wikipedia.output(u'Adding new interproject to %s (%s)'
                         % (code, candidate.title()))
        links.setdefault(code, []).append(candidate)

def _is_article_page(page):
    """Return True when page's text can be fetched and contains a wiki link."""
    try:
        return page.get().find('[[') != -1
    except Exception:
        # Any failure to fetch the text disqualifies the page.
        return False


def choose_links(title, links):
    """Reduce every entry of links to at most one page.

    title -- title of the local page, used in messages only.
    links -- dict (project code -> list of pages), modified in place.

    Pages whose text cannot be fetched or contains no '[[' are discarded.
    When several candidates remain (or always, with -select) the user is
    asked which one to keep; in autonomous mode the candidates are listed
    and left untouched.
    """
    for proj in links:
        candidates = links[proj]

        # Exclude non-article pages.  The list is rebuilt and assigned back
        # through a slice: removing items while iterating over the same list
        # skips the element following each removal, and the slice assignment
        # keeps the filtering in place like the original remove() calls did.
        candidates[:] = [p for p in candidates if _is_article_page(p)]

        if not candidates:
            # Nothing left to pick from, so there is nothing to ask.
            continue

        if len(candidates) == 1 and not globalvar.select:
            links[proj] = [candidates[0]]
            continue

        wikipedia.output("The following pages have been found for %s" % title)
        for position, p in enumerate(candidates):
            wikipedia.output(u"  (%d) %s" % (position + 1, p.aslink(True)))

        if globalvar.autonomous:
            wikipedia.output("More than one page found, not changing articles");
            continue

        rpage = None
        while True:
            answer = wikipedia.input(u"Which page should be used [number, (n)one]? :")
            if not answer:
                continue
            if answer == 'n':
                break
            if answer.isdigit():
                index = int(answer) - 1
                # Explicit range check: a plain candidates[index] would
                # accept "0" as index -1 and pick the last page.
                if 0 <= index < len(candidates):
                    rpage = candidates[index]
                    break

        # Use a list to make it easy to compare with oldlinks.
        links[proj] = [rpage]

def generate_interproject(links, flags):
    """Build the {{interprojet}} template text for the chosen links.

    links -- dict (project code -> list of pages); only the first page of
             each list is used, empty lists and false first entries are
             skipped.
    flags -- extra positional arguments emitted before the links.

    Returns the template as a string, or "" when there is no usable link
    (the original returned None in one of the two "nothing" cases; callers
    only test the result for truth).
    """
    if not links:
        return ""

    params = ["|%s" % flag for flag in flags]

    usable = 0
    for code in links:
        pages = links[code]
        if not pages or not pages[0]:
            continue
        usable += 1
        params.append("|%s=%s" % (code, pages[0].title()))

    if usable == 0:
        return ""

    return "{{interprojet" + "".join(params) + "}}"

def getDefaultSort(text):
    """Return the first sort-key template found in text, or "".

    Recognises {{DEFAULTSORT:...}}, {{CLEDETRI:...}} and {{CLEFDETRI:...}}.
    When one is found it is returned verbatim followed by '\\r\\n'.
    """
    # Plain raw string: the ur'' prefix only exists in Python 2 and the
    # pattern is pure ASCII, so r'' matches exactly the same text.
    sort_key_re = re.compile(r'(\{\{ *?(DEFAULTSORT|CLEDETRI|CLEFDETRI)\s*:.+? *}})', re.DOTALL)
    match = sort_key_re.search(text)
    if match is None:
        return ""
    return match.group(1) + '\r\n'

def removeDefaultSort(text):
    """Return text without its sort-key templates, stripped of outer whitespace.

    {{DEFAULTSORT:...}}, {{CLEDETRI:...}} and {{CLEFDETRI:...}} are removed
    through wikipedia.replaceExcept, which is told to leave the 'nowiki',
    'comment', 'math' and 'pre' sections alone.
    """
    # Plain raw string: the ur'' prefix only exists in Python 2 and the
    # pattern is pure ASCII, so r'' matches exactly the same text.
    sort_key_re = re.compile(r'\{\{ *?(DEFAULTSORT|CLEDETRI|CLEFDETRI)\s*:.+? *}}', re.DOTALL)
    cleaned = wikipedia.replaceExcept(text, sort_key_re, '', ['nowiki', 'comment', 'math', 'pre'])
    return cleaned.strip()

def commit(page, text, interproject, oldtext):
    """Assemble the new page text and diff it against oldtext.

    page         -- the page being edited (not used here).
    text         -- page text with the interproject template already removed.
    interproject -- new template text; nothing is inserted when it is false.
    oldtext      -- text to compare the result with.

    Categories, the sort key and language links are taken out, the template
    is appended to what is left, then categories (preceded by the sort key)
    and language links are put back.  The diff is printed.

    Returns [new text, unified diff as one string].
    """
    site = wikipedia.getSite()

    # Remember the trailing blocks before stripping them.
    categories = wikipedia.getCategoryLinks(text, site = site)
    defaultsort = getDefaultSort(text)
    interwiki = wikipedia.getLanguageLinks(text)

    body = removeDefaultSort(
        wikipedia.removeLanguageLinks(
            wikipedia.removeCategoryLinks(text, site), site))

    if interproject:
        body = body + '\r\n\r\n' + interproject

    # Re-attach the trailing blocks after the template.
    body = wikipedia.replaceCategoryLinks(body, categories, site = site, prepend = defaultsort)
    body = wikipedia.replaceLanguageLinks(body, interwiki, site = site)

    changes = difflib.unified_diff(oldtext.splitlines(), body.splitlines())
    diff = "".join([line + '\n' for line in changes])

    wikipedia.output(u'%s' % diff)
    return [body, diff]

def workon(page):
    """Check and, if needed, rewrite the interproject template of one page.

    The main page and redirects are skipped.  In compare mode the page is
    only examined by check_backlink() and never edited.  Otherwise the
    template is regenerated and the page is saved when the text changed,
    unless -dry is set; with -ask the user confirms every save.
    """
    if page.title() == globalvar.mainpagename:
        return
    # Redirects point to pages which would be handled at some point
    # anyway, so ignore them.
    if page.isRedirectPage():
        return

    wikipedia.output(u'handling %s' % page.title())

    try:
        text = page.get()
    except wikipedia.IsRedirectPage:
        pagename = page.getRedirectTarget()
        page = wikipedia.Page(page.site(), pagename)
        try:
            text = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u'Broken redirect to %s' % pagename)
            return
    except wikipedia.NoPage:
        # A missing page must not abort the whole run.
        wikipedia.output(u'Page %s does not exist. Skipping.' % page.title())
        return

    # Keep the untouched text for the diff; parse_interproject() returns a
    # copy with the template removed.
    oldtext = text

    # Hash, key is project link code ('q', 'n'...)
    links = {}

    (projects, text) = parse_interproject(page.title(), text)
    flags = check_interprojects(page.title(), projects, links)

    if globalvar.compare:
        check_backlink(page, links)
        return

    oldlinks = links.copy()
    if not globalvar.autonomous:
        check_projects(page.title(), links)

    choose_links(page.title(), links)

    interproject = generate_interproject(links, flags)

    [text, diff] = commit(page, text, interproject, oldtext)

    # Generate edit summary
    lang = page.site().lang
    if not projects:
        summary = wikipedia.translate(lang, msg)[1]
    else:
        if not links:
            summary = wikipedia.translate(lang, msg)[2]
        else:
            summary = wikipedia.translate(lang, msg)[3]

    wikipedia.output(u'diff : %s' % diff)
    if diff:
        wikipedia.output(u'diff!')
    else:
        wikipedia.output(u'notdiff!')

    # Do not do unneeded changes in autonomous mode.
    if (globalvar.autonomous and (oldlinks != links)) or diff:
        if globalvar.dry:
            # -dry promises not to touch the wiki; the diff has already
            # been printed above.
            wikipedia.output(u'Dry run: not saving %s' % page.title())
            return
        if globalvar.ask:
            answer = wikipedia.inputChoice(u'Submit?', ['Yes', 'No'], ['y', 'n'])
            if answer == 'n':
                return
        try:
            page.put(text, comment = wikipedia.translate(lang, msg)[0] + summary)
        except wikipedia.LockedPage:
            wikipedia.output(u'Page %s is locked. Skipping.' % page.title())

class Main(object):
    """Command-line driver.

    Parses the options, picks the page generator they select, feeds every
    page to workon() and, in compare mode, publishes the resulting report.
    """

    # Options
    __start = None
    __number = None
    ## Which page generator to use
    __workonnew = False
    __catname = None
    __catrecurse = False
    __linkpagetitle = None
    __refpagetitle = None
    __textfile = None
    __pagetitles = []

    @staticmethod
    def _split_option(arg):
        """Split "-name:value" into (name, value); value is '' when absent.

        Only the first colon separates, so values may contain colons.
        """
        pieces = arg.split(':', 1)
        if len(pieces) == 1:
            return (arg, '')
        return (pieces[0], pieces[1])

    def parse(self):
        """Read the command line into the option attributes and globalvar.

        Options taking a value accept "-name:value"; when the value is
        missing it is asked for interactively.  Arguments that are not a
        known option are taken as titles of pages to work on.
        """
        # Per-instance list, so the class-level default is never mutated.
        self.__pagetitles = []

        for arg in wikipedia.handleArgs():
            name, value = self._split_option(arg)

            if name == '-ref':
                self.__refpagetitle = value or wikipedia.input(u'Links to which page should be processed?')
            elif name == '-cat':
                self.__catname = value or wikipedia.input(u'Please enter the category name:')
            elif arg.startswith('-subcat'):
                self.__catrecurse = True
            elif name == '-links':
                self.__linkpagetitle = value or wikipedia.input(u'Links from which page should be processed?')
            elif name == '-file':
                self.__textfile = value or wikipedia.input(u'File to read pages from?')
            elif arg == '-new':
                self.__workonnew = True
            elif name == '-start':
                self.__start = value or wikipedia.input(u'Which page to start from: ')
            elif name == '-number':
                self.__number = int(value or wikipedia.input(u'Number of pages to parse: '))
            elif arg == '-select':
                globalvar.select = True
            elif arg == '-ask':
                globalvar.ask = True
            elif arg == '-dry':
                globalvar.dry = True
            elif arg == '-autonomous':
                globalvar.autonomous = True
            elif name == '-compare':
                globalvar.compare = value or wikipedia.input(u'Project to compare to: ')
            elif name == '-output':
                globalvar.outputpage = value or wikipedia.input(u'Page to print output to: ')
            else:
                self.__pagetitles.append(arg)

    def generator(self):
        """Return the page generator selected by the parsed options.

        Precedence: -new, -ref, -links, -cat, -file, and finally all pages
        starting at -start (or at '!' when no start was given).
        """
        site = wikipedia.getSite()

        if self.__workonnew:
            if not self.__number:
                self.__number = config.special_page_limit
            return pagegenerators.NewpagesPageGenerator(number = self.__number)

        if self.__refpagetitle:
            refpage = wikipedia.Page(site, self.__refpagetitle)
            return pagegenerators.ReferringPageGenerator(refpage)

        if self.__linkpagetitle:
            linkpage = wikipedia.Page(site, self.__linkpagetitle)
            return pagegenerators.LinkedPageGenerator(linkpage)

        if self.__catname:
            cat = catlib.Category(site, 'Category:%s' % self.__catname)
            if self.__start:
                return pagegenerators.CategorizedPageGenerator(cat, recurse = self.__catrecurse, start = self.__start)
            return pagegenerators.CategorizedPageGenerator(cat, recurse = self.__catrecurse)

        if self.__textfile:
            return pagegenerators.TextfilePageGenerator(self.__textfile)

        if not self.__start:
            self.__start = '!'
        startpage = wikipedia.Page(site, self.__start)
        return pagegenerators.AllpagesPageGenerator(startpage.titleWithoutNamespace(),
                                                    startpage.namespace())

    def getSites(self, site):
        """Fill globalvar.siblings and the matching globalvar.sites entries.

        site -- the home site; its language is used for every sibling except
                commons, which uses the code 'commons'.
        """
        # NOTE: we are excluding wikinews because it doesn't make sense to look for
        # the same names in wikinews AFAICS.

        # FIXME: right now, we are excluding wiktionary because it's case-sensitive.
        # Handle that.

        # The sibling list is deliberately not narrowed to the -compare
        # project: the "has interprojects?" report column needs all of them.
        globalvar.siblings = ['b', 'commons', 's', 'w']
        for s in globalvar.siblings:
            if s == 'commons':
                code = 'commons'
            else:
                code = site.lang

            globalvar.sites[s] = wikipedia.getSite(code, site.family.known_families[s])

    def main(self):
        """Run the bot: set up, process every page, publish the report."""
        wikipedia.setLogfileStatus(True, 'interproject.log')

        # Fetched outside the try block so that a failure here is reported
        # as such instead of surfacing later as an unbound 'site'.
        site = wikipedia.getSite()

        # ensure that we don't try to change main page
        try:
            globalvar.mainpagename = site.family.mainpages[site.language()]
        except Exception:
            wikipedia.output(u'Missing main page name')

        # Parse command line options
        self.parse()

        # Fill globalvar.sites
        self.getSites(site)

        if self.__pagetitles:
            # Explicit titles take precedence over any generator option.
            pages = []
            for p in self.__pagetitles:
                try:
                    pages.append(wikipedia.Page(wikipedia.getSite(), p))
                except wikipedia.NoPage: pass
            generator = pagegenerators.PreloadingGenerator(iter(pages))
        else:
            generator = pagegenerators.PreloadingGenerator(self.generator())

        for page in generator:
            workon(page)

        if globalvar.compare:
            # Add wikipedia comparison
            globalvar.output = "{|\n|-\n!Article||Article Wikipedia||Lien vers Wikiquote ?||Lien vers l'article correspondant ?\n" + globalvar.output
            globalvar.output += "|}\n"

            # Add articles without interprojects
            globalvar.output += "\n== Articles sans interprojet(s) ==\n\n"
            globalvar.output += "{|\n|-\n!Article||Lien vers Wikipedia ?||Lien interprojet ?\n"
            globalvar.output += globalvar.noip
            globalvar.output += "|}\n"

            if globalvar.dry:
                # -dry promises not to touch the wiki: print the report only.
                wikipedia.output(u'Dry run: not saving %s, generated output was:\n%s' % (globalvar.outputpage, globalvar.output))
                return

            try:
                outputpage = wikipedia.Page(site, globalvar.outputpage)
                outputpage.put(globalvar.output, comment = "Mise a jour de la liste")
            except Exception:
                # Best effort: never lose the report, print it instead.
                wikipedia.output(u'Getting/Modifying page %s failed, generated output was:\n%s' % (globalvar.outputpage, globalvar.output))

# Shared settings object read and written by the functions and classes above.
globalvar = Global()
# NOTE: the finally clause runs whether or not the __main__ guard is true, so
# wikipedia.stopme() is called both after a normal run and when this module
# is merely imported.
try:
    if __name__ == "__main__":
        Main().main()
finally:
    wikipedia.stopme()