#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script to check interproject links for general pages. This works by downloading the
page, and checking if a page with the same name exists on other wikis (list
specified on command line).
# TODO
# - Check redirects to the original page
# - Handle disambig pages on remote projects
This script understands various command-line arguments:
-cat Work on all pages which are in a specific category.
Argument can also be given as "-cat:categoryname".
-ref Work on all pages that link to a certain page.
Argument can also be given as "-ref:referredpagetitle".
-links Work on all pages that are linked from a certain page.
Argument can also be given as "-links:linkingpagetitle".
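-new Work on the most recent new pages on the wiki
-number: used as -number:n together with -new, sets how many of the
most recent new pages are fetched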
-subcat When the pages to work on have been chosen by -cat, pages in
subcategories of the selected category are also included.
When -cat has not been selected, this has no effect.
-file: used as -file:filename, read a list of pages to treat
from the named file
-start: used as -start:title, specifies that the robot should
go alphabetically through all pages on the home wiki,
starting at the named page.
-select ask for *every* link whether it should be included or not
-ask ask before any change is made to the wiki
-dry do not change wiki, just print what would have been done
-autonomous run the script in autonomous mode: ask no questions, only
check whether the current interproject links are valid (and
remove them if they are not)
-compare: used as -compare:project, check that for every interproject
linking to project, there is a link back and dump a list
of pages that miss this link
-output: used as -output:pagename, output the result of -compare into this pagename
"""
import wikipedia, pagegenerators, catlib, config
import sys, re, string, difflib
# Edit-summary fragments, keyed by language code.  Each tuple holds, in
# order: the prefix put in front of every summary, then the texts used when
# interproject links are added, removed and modified.
msg = {
    'en': (
        u'robot: ',
        u'adding interproject links',
        u'removing interproject links',
        u'modifying interproject links',
    ),
    'fr': (
        u'robot : ',
        u'ajoute les liens interprojets',
        u'retire les liens interprojets',
        u'modifie les liens interprojets',
    ),
}
class Global(object):
    """Container class for global settings.

    Use of globals outside of this is to be avoided.
    """

    # Set by -select: ask about every candidate link.
    select = False
    # Set by -ask: ask for confirmation before a page is saved.
    ask = False
    # Set by -dry.
    dry = False
    # Set by -autonomous: never ask questions.
    autonomous = False
    # Project named with -compare:project, or None.
    compare = None
    # Title of the page named with -output:pagename, or None.
    outputpage = None
    # Wikitext table rows collected for the -compare report.
    output = ""
    # Rows for pages that have no link to Wikipedia (-compare report).
    noip = ""
    # Title of the home wiki's main page, which is never processed.
    mainpagename = None

    def __init__(self):
        # Mutable containers are created per instance so that filling them
        # never modifies state shared through the class object.
        # Maps a project code to its Site object.
        self.sites = {}
        # Project codes that are searched for pages with the same title.
        self.siblings = []
def check_backlink(page, links):
    """Add one row about ``page`` to the -compare report.

    When ``links`` has neither a 'w' nor a 'wikipedia' entry, a row is
    appended to ``globalvar.noip`` and nothing is downloaded.  Otherwise the
    first linked Wikipedia page is fetched, scanned for ``<a href>`` links
    that point to the home wiki, and a row with the outcome is appended to
    ``globalvar.output``.

    page  -- page on the home wiki that is being checked
    links -- dict mapping project codes to lists of pages; a 'wikipedia'
             entry is copied to the 'w' key as a side effect
    """
    if 'w' not in links and 'wikipedia' not in links:
        # Last column: does the page carry any interproject link at all?
        any_link = u'{{oui}}\n' if links else u'{{non}}\n'
        globalvar.noip += (u'|-\n|[[%s]] || {{non}} || ' % page.title()) + any_link
        return

    if 'wikipedia' in links:
        links['w'] = links['wikipedia']
    remote = links['w'][0]
    remote_site = remote.site()

    # Make sure it's a recent version we're getting.
    remote_site.forceLogin()
    home = wikipedia.getSite()
    html = remote_site.getUrl(remote_site.get_address(remote.urlname()))
    globalvar.output += u'|-\n|[[%s]]||[[:%s:%s]]' % (
        page.title(), remote_site.sitename(), remote.title())

    # Every link from the remote page to the home wiki; the captured group is
    # the part of the URL that follows the home wiki's article path.
    link_re = re.compile(r'<a href="//%s%s([^"]+)"'
                         % (home.hostname(), home.nice_get_address('')))
    targets = link_re.findall(html)
    if not targets:
        globalvar.output += u'|| {{non}} || {{non}}\n'
        return

    globalvar.output += u'|| {{oui}}'
    if page.urlname() in targets:
        globalvar.output += u'|| {{oui}}\n'
    else:
        globalvar.output += u'|| {{non}}\n'
def parse_interproject(title, text):
    """Extract and remove every ``{{interprojet ...}}`` template from ``text``.

    title -- page title, only used in the warning printed when the template
             occurs more than once
    text  -- wikitext of the page

    Returns a ``(projects, text)`` tuple.  ``projects`` is the list of the
    distinct pipe-separated template arguments (for example ``u'w=Foo'``), in
    the order they first appear and with whitespace around ``=`` and ``|``
    removed; it may contain an empty string for the text before the first
    pipe.  ``text`` is the input with all matched templates cut out.
    """
    template_re = re.compile(u'{{interprojet(.*?)}}',
                             re.IGNORECASE | re.MULTILINE | re.DOTALL)
    # Collapses whitespace and stray pipes around each '=' or '|' separator.
    separator_re = re.compile(r'[\s|]*([=|])[\s]*')

    projects = []
    seen = set()
    kept = []        # pieces of text lying between the templates
    last_end = 0
    found = False
    for match in template_re.finditer(text):
        if found:
            wikipedia.output(u'WARNING: %s includes {{interprojet}} more than once!'
                             % title)
        found = True
        # Cleanup : remove unneeded whitespaces, pipes and newlines
        args = match.group(1).replace("\n", "").replace("\r", "")
        args = separator_re.sub('\\1', args)
        # Merge the arguments of all templates, dropping duplicates but
        # keeping a stable first-seen order.
        for project in args.split('|'):
            if project not in seen:
                seen.add(project)
                projects.append(project)
        kept.append(text[last_end:match.start()])
        last_end = match.end()
    kept.append(text[last_end:])
    return (projects, ''.join(kept))
def check_interprojects(title, projects, links):
    """Resolve the arguments of an existing interproject template to pages.

    Every entry of ``projects`` has the form ``code``, ``code=Page`` or
    ``code-lang=Page``.  For each usable entry the target page is looked up
    and added to ``links[code]``.  When the target does not exist the entry
    is dropped in -compare/-autonomous mode, otherwise the user is asked
    whether to remove it.

    title    -- title of the local page, used when no explicit page is given
    projects -- list of template arguments
    links    -- dict mapping project codes to lists of pages, updated in place

    Returns the list of flags to keep in the template; it is always empty
    because the old 'nolink'/'etiq' tags are dropped.
    """
    flags = []
    site = wikipedia.getSite()
    for entry in projects:
        if not entry:
            continue
        # Remove old, deprecated tags.
        if entry.startswith('nolink') or entry.startswith('etiq'):
            continue

        # get explicit page name if given
        project, equals, explicit = entry.partition('=')
        val = explicit if equals else title
        # "|code=" is ignored by the template, so ignore it (and remove it)
        if not val:
            continue

        # An optional "-lang" suffix selects another language edition.
        pproject, dash, suffix = project.partition('-')
        lang = suffix if dash else site.lang

        known = site.family.known_families
        if pproject not in known:
            wikipedia.output(u'WARNING: %s has interproject link to unknown %s project'
                             % (title, project))
            continue
        family = known[pproject]

        if project not in globalvar.sites:
            if family in ('meta', 'commons'):
                lang = family
            try:
                globalvar.sites[project] = wikipedia.getSite(fam = family, code = lang)
            except ValueError:
                wikipedia.output(u'WARNING: %s has interproject link to known, but unimplemented family : %s'
                                 % (title, project))
                continue

        rpage = wikipedia.Page(globalvar.sites[project], val)
        if not rpage.exists():
            if globalvar.compare or globalvar.autonomous:
                continue
            answer = wikipedia.inputChoice(u'WARNING: %s has link to nonexisting page [[%s:%s]]. Remove it?'
                                           % (title, project, val),
                                           ['Yes', 'No'], ['y', 'n'])
            if answer == 'y':
                continue
        if rpage.isRedirectPage():
            rpage = rpage.getRedirectTarget()
        if project in links and rpage in links[project]:
            continue
        wikipedia.output(u'Adding new interproject to %s (%s)'
                         % (project, val) )
        links.setdefault(project, []).append(rpage)
    return flags
def check_projects(title, links):
    """Look on every sibling project for a page named ``title``.

    For each project code in ``globalvar.siblings`` the page with the same
    title is looked up (following a redirect) and, when it exists and is not
    already listed, appended to ``links[code]``.

    title -- title of the local page
    links -- dict mapping project codes to lists of pages, updated in place
    """
    # FIXME: (or with names redirecting on this one on local wikis)
    for code in globalvar.siblings:
        candidate = wikipedia.Page(globalvar.sites[code], title)
        if candidate.isRedirectPage():
            candidate = candidate.getRedirectTarget()
        if code in links and candidate in links[code]:
            wikipedia.output(u'%s already in %s links' % (title, code))
            continue
        if not candidate.exists():
            continue
        wikipedia.output(u'Adding new interproject to %s (%s)'
                         % (code, candidate.title()))
        links.setdefault(code, []).append(candidate)
def _is_article_page(candidate):
    """Return True when the page text can be read and contains a wiki link."""
    try:
        return candidate.get().find('[[') != -1
    except Exception:
        # Any failure to read the page makes it unusable as a link target.
        return False


def choose_links(title, links):
    """Reduce every entry of ``links`` to at most one page.

    Pages whose text cannot be read or contains no ``[[`` are dropped first.
    When more than one candidate is left (or always, with -select) the
    candidates are listed; in autonomous mode the entry is then left
    untouched, otherwise the user picks one page or none.

    title -- title of the local page, used in the messages
    links -- dict mapping project codes to lists of pages, updated in place
    """
    for proj in links:
        candidates = links[proj]
        # Exclude non-article pages.  The list is rebuilt and assigned back
        # in place: removing items while iterating over the same list skips
        # the element that follows each removal.
        candidates[:] = [p for p in candidates if _is_article_page(p)]
        if len(candidates) <= 1 and not globalvar.select:
            continue

        wikipedia.output("The following pages have been found for %s" % title)
        for number, p in enumerate(candidates):
            wikipedia.output(u"  (%d) %s" % (number + 1, p.aslink(True)))
        if globalvar.autonomous:
            wikipedia.output("More than one page found, not changing articles")
            continue

        while True:
            answer = wikipedia.input(u"Which page should be used [number, (n)one]? :")
            if not answer:
                continue
            if answer == 'n':
                chosen = None
                break
            # Only accept numbers that were actually listed; "0" used to
            # select the last page through negative indexing.
            if answer.isdigit() and 1 <= int(answer) <= len(candidates):
                chosen = candidates[int(answer) - 1]
                break
        # Use a list to make it easy to compare with oldlinks.
        links[proj] = [chosen]
def generate_interproject(links, flags):
    """Build the ``{{interprojet|...}}`` template text for ``links``.

    links -- dict mapping project codes to lists of pages; only the first
             page of each list is used, and empty lists or a false first
             entry are skipped
    flags -- extra template arguments placed before the links

    Returns "" when ``links`` is empty, None when no entry yields a usable
    page, and the template text otherwise.
    """
    if not links:
        return ""
    flag_params = ["|%s" % flag for flag in flags]
    link_params = []
    for code in links:
        pages = links[code]
        if pages and pages[0]:
            link_params.append("|%s=%s" % (code, pages[0].title()))
    if not link_params:
        return None
    return "{{interprojet" + "".join(flag_params) + "".join(link_params) + "}}"
def getDefaultSort(text):
    """Return the first sort-key template found in ``text``.

    Recognised names are DEFAULTSORT, CLEDETRI and CLEFDETRI.  The matched
    template is returned followed by '\\r\\n'; the result is "" when there
    is none.
    """
    found = re.search(r'(\{\{ *?(DEFAULTSORT|CLEDETRI|CLEFDETRI)\s*:.+? *}})',
                      text, re.DOTALL)
    if found is None:
        return ""
    return found.group(1) + '\r\n'
def removeDefaultSort(text):
    """Return ``text`` stripped, with the sort-key templates removed.

    The DEFAULTSORT/CLEDETRI/CLEFDETRI templates are deleted through
    ``wikipedia.replaceExcept``, which is told to leave the nowiki, comment,
    math and pre sections alone.
    """
    sortkey_re = re.compile(r'\{\{ *?(DEFAULTSORT|CLEDETRI|CLEFDETRI)\s*:.+? *}}',
                            re.DOTALL)
    protected = ['nowiki', 'comment', 'math', 'pre']
    cleaned = wikipedia.replaceExcept(text, sortkey_re, '', protected)
    return cleaned.strip()
def commit(page, text, interproject, oldtext):
    """Assemble the new page text and compute its diff against ``oldtext``.

    Categories, the sort key and the language links are taken out of
    ``text``, the interproject template is appended, and the categories
    (with the sort key prepended) and language links are put back.  Nothing
    is saved here.

    page         -- not used by this function
    text         -- page text without its interproject template
    interproject -- template text to append; skipped when false
    oldtext      -- text the diff is computed against

    Returns ``[new_text, diff]`` where ``diff`` is a unified diff, "" when
    both texts are equal.  The diff is also printed.
    """
    site = wikipedia.getSite()

    # Remember the trailing elements before stripping them from the body.
    categories = wikipedia.getCategoryLinks(text, site = site)
    defaultsort = getDefaultSort(text)
    interwiki = wikipedia.getLanguageLinks(text)

    body = wikipedia.removeCategoryLinks(text, site)
    body = wikipedia.removeLanguageLinks(body, site)
    body = removeDefaultSort(body)
    if interproject:
        body = body + '\r\n\r\n' + interproject

    body = wikipedia.replaceCategoryLinks(body, categories, site = site, prepend = defaultsort)
    body = wikipedia.replaceLanguageLinks(body, interwiki, site = site)

    changes = difflib.unified_diff(oldtext.splitlines(), body.splitlines())
    diff = "".join([line + '\n' for line in changes])
    wikipedia.output(u'%s' % diff)
    return [body, diff]
def workon(page):
    """Check and update the interproject links of a single page.

    The main page and redirects are skipped.  In -compare mode only the
    backlink report is updated.  Otherwise the interproject template is
    rebuilt from the existing links plus the pages found on the sibling
    projects, and the page is saved when that produces a change.  With -dry
    nothing is saved, and with -ask the user confirms each save.

    page -- the page to process
    """
    if page.title() == globalvar.mainpagename:
        return
    # Redirects point to pages which would be handled at some point
    # anyway, so ignore them.
    if page.isRedirectPage():
        return
    wikipedia.output(u'handling %s' % page.title())
    try:
        text = page.get()
    except wikipedia.IsRedirectPage:
        pagename = page.getRedirectTarget()
        page = wikipedia.Page(page.site(), pagename)
        try:
            text = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u'Broken redirect to %s' % pagename)
            return
    except wikipedia.NoPage:
        # A missing page must not abort the whole run.
        wikipedia.output(u'Page %s does not exist. Skipping.' % page.title())
        return

    # Hash, key is project link code ('q', 'n'...)
    links = {}
    (projects, text) = parse_interproject(page.title(), text)
    flags = check_interprojects(page.title(), projects, links)
    if globalvar.compare:
        check_backlink(page, links)
        return

    oldlinks = links.copy()
    if not globalvar.autonomous:
        check_projects(page.title(), links)
    choose_links(page.title(), links)
    interproject = generate_interproject(links, flags)
    [text, diff] = commit(page, text, interproject, page.get())

    # Generate edit summary
    lang = page.site().lang
    messages = wikipedia.translate(lang, msg)
    if not projects:
        summary = messages[1]
    elif not links:
        summary = messages[2]
    else:
        summary = messages[3]

    wikipedia.output(u'diff : %s' % diff)
    if diff:
        wikipedia.output(u'diff!')
    else:
        wikipedia.output(u'notdiff!')

    # Do not do unneeded changes in autonomous mode.
    if not ((globalvar.autonomous and (oldlinks != links)) or diff):
        return
    if globalvar.dry:
        # -dry: the diff has been printed above, leave the wiki untouched.
        wikipedia.output(u'Dry run: not saving changes to %s' % page.title())
        return
    if globalvar.ask:
        answer = wikipedia.inputChoice(u'Submit?', ['Yes', 'No'], ['y', 'n'])
        if answer == 'n':
            return
    try:
        page.put(text, comment = messages[0] + summary)
    except wikipedia.LockedPage:
        wikipedia.output(u'Page %s is locked. Skipping.' % page.title())
class Main(object):
    """Command-line driver of the script.

    parse() reads the options, generator() builds the matching page
    generator, getSites() fills ``globalvar.sites`` and main() ties them
    together and runs workon() on every page.
    """

    # Options
    __start = None
    __number = None
    ## Which page generator to use
    __workonnew = False
    __catname = None
    __catrecurse = False
    __linkpagetitle = None
    __refpagetitle = None
    __textfile = None

    def __init__(self):
        # Page titles given directly on the command line.  Created per
        # instance: a class-level list would be shared by every Main object.
        self.__pagetitles = []

    @staticmethod
    def _is_option(arg, name):
        """Return True for ``name`` on its own or in the ``name:value`` form."""
        return arg == name or arg.startswith(name + ':')

    @staticmethod
    def _option_value(arg, name, question):
        """Return the value of ``name:value``; prompt when only ``name`` was given."""
        if len(arg) == len(name):
            return wikipedia.input(question)
        return arg[len(name) + 1:]

    def parse(self, args=None):
        """Read the command-line options into this object and ``globalvar``.

        args -- list of arguments to parse; defaults to the arguments
                returned by ``wikipedia.handleArgs()``

        Arguments that are not recognised as options are collected as page
        titles.
        """
        if args is None:
            args = wikipedia.handleArgs()
        value = self._option_value
        for arg in args:
            if arg.startswith('-ref'):
                self.__refpagetitle = value(arg, '-ref', u'Links to which page should be processed?')
            elif arg.startswith('-cat'):
                self.__catname = value(arg, '-cat', u'Please enter the category name:')
            elif arg.startswith('-subcat'):
                self.__catrecurse = True
            elif arg.startswith('-links'):
                self.__linkpagetitle = value(arg, '-links', u'Links from which page should be processed?')
            # The options below used to require the trailing colon, which
            # made their interactive prompt unreachable.
            elif self._is_option(arg, '-file'):
                self.__textfile = value(arg, '-file', u'File to read pages from?')
            elif arg == '-new':
                self.__workonnew = True
            elif self._is_option(arg, '-start'):
                self.__start = value(arg, '-start', u'Which page to start from: ')
            elif self._is_option(arg, '-number'):
                self.__number = int(value(arg, '-number', u'Number of pages to parse: '))
            elif arg == '-select':
                globalvar.select = True
            elif arg == '-ask':
                globalvar.ask = True
            elif arg == '-dry':
                globalvar.dry = True
            elif arg == '-autonomous':
                globalvar.autonomous = True
            elif self._is_option(arg, '-compare'):
                globalvar.compare = value(arg, '-compare', u'Project to compare to: ')
            elif self._is_option(arg, '-output'):
                globalvar.outputpage = value(arg, '-output', u'Page to print output to: ')
            else:
                self.__pagetitles.append(arg)

    def generator(self):
        """Return the page generator selected by the parsed options.

        Precedence is -new, -ref, -links, -cat, -file; without any of them
        all pages are listed alphabetically, starting at -start (or '!').
        """
        site = wikipedia.getSite()
        if self.__workonnew:
            if not self.__number:
                self.__number = config.special_page_limit
            return pagegenerators.NewpagesPageGenerator(number = self.__number)
        if self.__refpagetitle:
            refpage = wikipedia.Page(site, self.__refpagetitle)
            return pagegenerators.ReferringPageGenerator(refpage)
        if self.__linkpagetitle:
            linkpage = wikipedia.Page(site, self.__linkpagetitle)
            return pagegenerators.LinkedPageGenerator(linkpage)
        if self.__catname:
            cat = catlib.Category(site, 'Category:%s' % self.__catname)
            if self.__start:
                return pagegenerators.CategorizedPageGenerator(cat, recurse = self.__catrecurse, start = self.__start)
            return pagegenerators.CategorizedPageGenerator(cat, recurse = self.__catrecurse)
        if self.__textfile:
            return pagegenerators.TextfilePageGenerator(self.__textfile)
        if not self.__start:
            self.__start = '!'
        startpage = wikipedia.Page(site, self.__start)
        return pagegenerators.AllpagesPageGenerator(startpage.titleWithoutNamespace(),
                                                    startpage.namespace())

    def getSites(self, site):
        """Fill ``globalvar.siblings`` and the matching ``globalvar.sites``.

        site -- the home site; its language is used for every sibling except
                commons, which uses the 'commons' code
        """
        # Get Site objects for all projects, since we are going to need them no
        # matter what.
        # NOTE: we are excluding wikinews because it doesn't make sense to look for
        # the same names in wikinews AFAICS.
        # FIXME: right now, we are excluding wiktionary because it's case-sensitive.
        # Handle that.
        # Restricting the siblings to globalvar.compare would be a bad idea
        # if the report is to show the "has interprojects?" column.
        globalvar.siblings = ['b', 'commons', 's', 'w']
        for s in globalvar.siblings:
            if s == 'commons':
                code = 'commons'
            else:
                code = site.lang
            globalvar.sites[s] = wikipedia.getSite(code, site.family.known_families[s])

    def main(self):
        """Run the script: parse the options, process the pages, write the report."""
        wikipedia.setLogfileStatus(True, 'interproject.log')
        # Fetched outside the try block so that `site` is always bound for
        # the code further down.
        site = wikipedia.getSite()
        # ensure that we don't try to change main page
        try:
            globalvar.mainpagename = site.family.mainpages[site.language()]
        except Exception:
            wikipedia.output(u'Missing main page name')
        # Parse command line options
        self.parse()
        # Fill globalvar.sites
        self.getSites(site)
        pagegen = self.generator()
        if self.__pagetitles:
            # Explicit titles take precedence over any generator option.
            pages = []
            for p in self.__pagetitles:
                try:
                    pages.append(wikipedia.Page(wikipedia.getSite(), p))
                except wikipedia.NoPage:
                    pass
            generator = pagegenerators.PreloadingGenerator(iter(pages))
        else:
            generator = pagegenerators.PreloadingGenerator(pagegen)
        for page in generator:
            workon(page)
        if globalvar.compare:
            # Add wikipedia comparison
            globalvar.output = "{|\n|-\n!Article||Article Wikipedia||Lien vers Wikiquote ?||Lien vers l'article correspondant ?\n" + globalvar.output
            globalvar.output += "|}\n"
            # Add articles without interprojects
            globalvar.output += "\n== Articles sans interprojet(s) ==\n\n"
            globalvar.output += "{|\n|-\n!Article||Lien vers Wikipedia ?||Lien interprojet ?\n"
            globalvar.output += globalvar.noip
            globalvar.output += "|}\n"
            try:
                outputpage = wikipedia.Page(site, globalvar.outputpage)
                outputpage.put(globalvar.output, comment = "Mise a jour de la liste")
            except Exception:
                # Best effort: print the report rather than lose it.
                wikipedia.output(u'Getting/Modifying page %s failed, generated output was:\n%s' % (globalvar.outputpage, globalvar.output))
# The single settings object shared by all functions of this module.
globalvar = Global()

try:
    if __name__ == "__main__":
        bot = Main()
        bot.main()
finally:
    # Executed whether or not main() ran or raised.
    wikipedia.stopme()