User:WindBOT/Filters: Difference between revisions
Jump to navigation
Jump to search
Implement
mNo edit summary |
→Semantic filters: I hope this won't break anything! |
||
| Line 19: | Line 19: | ||
== Semantic filters == | == Semantic filters == | ||
addSafeFilter( | |||
wordFilter(u'Portal Wiki'), # Team Fortress Wiki | |||
) | |||
== Language-specific filters == | == Language-specific filters == | ||
Revision as of 14:16, 27 April 2011
How to disable a filter
If the bot is malfunctioning, chances are that the problem lies in one of these blocks of code. Thus, instead of shutting down the whole bot, it would be wiser to disable only the chunk of code that is misbehaving. To make the bot ignore a certain line, add a "#" in front of it:
# This line will be ignored
If there are multiple lines, wrap them inside triple-quotes (you still need to put the two spaces at the beginning of the line):
"""This line will be ignored and this one as well and this one is cake and the previous one was a lie but it was still ignored"""
If all else fails, you can simply delete the block from the page. The bot can't come up with code by itself yet, so it won't run anything. Or, if the problem really is elsewhere, block the bot.
Page filters
# Page filter registration. NOTE(review): presumably the first regex selects page
# titles to skip (user pages) and the second exempts namespaces that should still
# be processed — confirm against addPageFilter's definition elsewhere in the bot.
addPageFilter(r'^user:', r'(?:talk|help|wiki|template):')
Semantic filters
# Semantic filter registration. NOTE(review): presumably makes the bot treat the
# phrase 'Portal Wiki' as off-limits for rewriting on this (Team Fortress) wiki —
# verify against wordFilter/addSafeFilter's definitions.
addSafeFilter(
    wordFilter(u'Portal Wiki'), # Team Fortress Wiki
)
Language-specific filters
None yet~
Link filters
Wikipedia links filter
def wikipediaLinks(link, **kwargs):
    """Link filter: rewrite external links to wikipedia.org articles as
    [[Wikipedia:...]]-style internal links.

    The optional language subdomain (group 1) is kept for non-English wikis;
    the article path (group 2) is percent-decoded and underscores become
    spaces. Links that had no label get a generic '(Wikipedia)' label.
    Returns the (possibly modified) link object.
    """
    wikipediaRegex = compileRegex(r'^https?://(?:(\w+)\.)?wikipedia\.org/wiki/(\S+)')
    if link.getType() == u'external':
        linkInfo = wikipediaRegex.search(link.getLink())
        if linkInfo:
            link.setType(u'internal')
            try:
                # Percent-decode the article path, then normalize underscores to spaces
                wikiPage = urllib2.unquote(str(linkInfo.group(2))).decode('utf8', 'ignore').replace(u'_', ' ')
            except Exception: # Narrowed from a bare except; fall back to the raw (undecoded) page name
                wikiPage = u(linkInfo.group(2)).replace(u'_', ' ')
            if not linkInfo.group(1) or linkInfo.group(1).lower() == u'en':
                link.setLink(u'Wikipedia:' + wikiPage) # English Wikipedia
            else:
                link.setLink(u'Wikipedia:' + linkInfo.group(1).lower() + u':' + wikiPage) # Non-english Wikipedia
            if link.getLabel() is None:
                link.setLabel(u'(Wikipedia)')
    return link
addLinkFilter(wikipediaLinks)
Remove trailing slashes from internal links
def removeTrailingSlash(l, **kwargs):
    """Link filter: strip one trailing '/' from internal link targets.

    External links are returned untouched. Guards against an empty link
    target: the original indexed getLink()[-1] unconditionally, which
    raised IndexError when the target was an empty string.
    """
    if l.getType() != u'internal':
        return l
    target = l.getLink()
    if target and target[-1] == '/':
        l.setLink(target[:-1])
    return l
addLinkFilter(removeTrailingSlash) # Register the trailing-slash cleanup for every parsed link
Template filters
Template renaming
def templateRenameMapping(t, **kwargs):
    """Template filter: canonicalize known-bad template names.

    Each entry maps a good (canonical) name to the list of lowercase
    variants that should be rewritten to it. Returns the template.
    """
    templateMap = {
        # Format goes like this (without the "#" in front obviously):
        #'Good template name': ['Bad template lowercase name 1', 'Bad template lowercase name 2', 'Bad template lowercase name 3'],
        # Last line has no comma at the end
        'Crush': ['pngcrush']
    }
    for goodName, badNames in templateMap.items():
        if t.getName().lower() in badNames:
            t.setName(goodName)
    return t
addTemplateFilter(templateRenameMapping) # Register the template-rename mapping for every parsed template
Remove useless templates
def removeUselessTemplate(t, **kwargs):
    """Template filter: drop templates that serve no purpose on this wiki.

    Returning None tells the framework to delete the template entirely;
    anything not in the useless list passes through unchanged.
    """
    uselessNames = (u'targeted', u'languages')
    if t.getName().lower() in uselessNames:
        return None # Delete template
    return t
addTemplateFilter(removeUselessTemplate) # Register the useless-template remover for every parsed template
Filter parameters of certain templates
def templateParamFilter(t, **kwargs):
    """Template filter: recursively fix the content of selected parameters
    of selected templates by running fixContent over them.

    Only templates listed in the map below are touched; parameters that are
    absent or empty are left alone. Returns the template.
    """
    params = { # Map: 'lowercase template name': ['list', 'of', 'params', 'to', 'filter']
        'patch layout': ['before', 'after', 'current'],
        'item infobox': ['released']
    }
    lowerName = t.getName().lower()
    if lowerName not in params:
        return t # Not a template we filter
    for paramName in params[lowerName]:
        if t.getParam(paramName):
            t.setParam(paramName, fixContent(t.getParam(paramName), **kwargs))
    return t
addTemplateFilter(templateParamFilter)
Remove obsolete parameters
def obsoleteParameterFilter(t, **kwargs):
    """Template filter: delete obsolete parameters from known templates.

    A parameter name containing the placeholder '#n' is treated as a
    numbered family: '#n' is expanded to each single digit 0-9 and every
    resulting name is deleted. The map is currently empty, so this filter
    is a no-op until entries are added. Returns the template.
    """
    params = { # Map: 'lowercase template name': ['list', 'of', 'params', 'to', 'delete']
    }
    templateName = t.getName().lower()
    if templateName not in params:
        return t
    for paramName in params[templateName]:
        paramName = u(paramName)
        if u'#n' in paramName:
            for digit in range(10):
                t.delParam(paramName.replace(u'#n', str(digit)))
        else:
            t.delParam(paramName)
    return t
addTemplateFilter(obsoleteParameterFilter)
Implement {{Dictionary}}
class DictionaryUpdater:
    """Filter that keeps the Template:Dictionary/* subpage tree in sync with
    the master dictionary pages.

    For each string defined on a master dictionary page it generates (or
    updates) a per-key subpage built from one of the subpage templates below,
    using an MD5 hash per key (stored on a Special:SyncData page) to avoid
    rewriting subpages whose content has not changed, and deletes subpages
    for keys that were removed from the master page.

    NOTE(review): this block was recovered from a wiki page dump; several
    lines appear to have lost string literals in extraction (marked inline).
    Verify against the live bot source before running.
    """
    def __init__(self):
        # Subpage body for translation entries, keyed by language code via
        # {{{lang|{{SUBPAGENAME}}}}}; %placeholders% are filled in generateSubpage.
        self.subpageTemplateLang = """{{#switch:{{{lang|{{SUBPAGENAME}}}}}|%options%}}<noinclude><hr style="margin: 1em 0em;" /><div style="font-size: 95%;">\n:[[File:Pictogram info.png|15px|text-top|link=]] '''Note''': Any changes made here will be automatically overwritten by a bot. Please ''do not'' make changes here as they will be lost. Edit '''[[:%dictionary%|the master page]]''' instead.\n:%missing%</div>[[Category:Template dictionary|%dictionaryname%/%keyname%]]</noinclude>"""
        # Subpage body for entries keyed by an arbitrary first template parameter.
        self.subpageTemplateParam = """{{#switch:{{{1|}}}|%options%}}<noinclude><hr style="margin: 1em 0em;" /><div style="font-size: 95%;">\n:[[File:Pictogram info.png|15px|text-top|link=]] '''Note''': Any changes made here will be automatically overwritten by a bot. Please ''do not'' make changes here as they will be lost. Edit '''[[:%dictionary%|the master page]]''' instead.</div>[[Category:Template dictionary|%dictionaryname%/%keyname%]]</noinclude>"""
        # Error box for invalid parameters (currently only referenced from a
        # commented-out #default line in generateSubpage).
        self.invalidParamError = """<div style="font-size: 95%; color: #CC0000;">\n:[[File:Pictogram info.png|15px|text-top|link=]] '''Error''': Invalid parameter passed.</div>"""
        # Subpage body for plain single-string entries (no subkeys).
        self.subpageTemplateID = """%string%<noinclude><hr style="margin: 1em 0em;" /><div style="font-size: 95%;">\n:[[File:Pictogram info.png|15px|text-top|link=]] '''Note''': Any changes made here will be automatically overwritten by a bot. Please ''do not'' make changes here as they will be lost. Edit '''[[:%dictionary%|the master page]]''' instead.</div>[[Category:Template dictionary|%dictionaryname%/%keyname%]]</noinclude>"""
        # Master dictionary pages handled by this filter.
        self.dictionaries = {
            u'Template:Dictionary/items': { # Dictionary page
                'name': 'items', # Dictionary name (used for categorizing)
                'sync': 'Template:Dictionary/items/Special:SyncData' # Page holding last sync data
            },
            u'Template:Dictionary/common strings': { # Warning: no underscore
                'name': 'common strings',
                'sync': 'Template:Dictionary/common strings/Special:SyncData'
            },
            u'Template:Dictionary/price': {
                'name': 'price',
                'sync': 'Template:Dictionary/price/Special:SyncData'
            },
            u'Template:Dictionary/mechanics': {
                'name': 'mechanics',
                'sync': 'Template:Dictionary/mechanics/Special:SyncData'
            }
        }
        self.subpageSeparator = u'/'
        # List of supported languages, in preferred order
        self.languages = [u'en', u'ar', u'cs', u'da', u'de', u'es', u'fi', u'fr', u'hu', u'it', u'ja', u'ko', u'nl', u'no', u'pl', u'pt', u'pt-br', u'ro', u'ru', u'sv', u'zh-hans', u'zh-hant']
        self.defaultLang = u'en'
        self.filterName = u'Your friendly neighborhood dictionary updater'
        # NOTE(review): the regex literal was lost in extraction — 'r' here is a
        # bare name, not a raw string. The original presumably matched the
        # comment blocks on the master page; restore it before running.
        self.commentsExtract = compileRegex(r)
        # Matches one string definition: optional '# comment' line, then
        # 'key1 | key2: value' either inline or as indented 'lang: text' lines.
        self.stringsExtract = compileRegex(r'(?:^[ \t]*#[ \t]*([^\r\n]*?)[ \t]*$\s*)?^[ \t]*([^\r\n]+?[ \t]*(?:\|[ \t]*[^\r\n]+?[ \t]*)*):[ \t]*([ \t]*[^\r\n]+?[ \t]*$|\s*[\r\n]+(?:\s*[-\w]+[ \t]*:[ \t]*[^\r\n]+[ \t]*$)+)', re.IGNORECASE | re.MULTILINE)
        # Matches a single 'subkey: text' line inside a multi-line value.
        self.translationExtract = compileRegex(r'^[ \t]*([-\w]+)[ \t]*:[ \t]*([^\r\n]+)[ \t]*$', re.IGNORECASE | re.MULTILINE)
        addWhitelistPage(self.dictionaries.keys())
    def generateSubpage(self, keyName, data, currentDict, syncData):
        """Build and save the subpage for one dictionary key.

        data is either a dict of subkeys (language codes or arbitrary
        parameter values) or a plain string. An MD5 over the normalized
        content is compared against syncData[keyName]; if unchanged, the
        page is not edited. syncData is updated in place.
        """
        h = hashlib.md5()
        if type(data) is type({}): # Subkeys (translations or not)
            isTranslation = True
            subpage = u(self.subpageTemplateLang)
            # If any subkey is not a known language code, treat the whole
            # entry as parameter-keyed rather than translation-keyed.
            for k in data:
                if k not in self.languages:
                    isTranslation = False
                    subpage = u(self.subpageTemplateParam)
                    break
            ordered = []
            if isTranslation:
                missing = []
                # Emit languages in the preferred order; hash missing ones
                # too so that adding a translation changes the hash.
                for lang in self.languages:
                    if lang in data:
                        ordered.append(lang + u'=' + data[lang])
                        h.update((lang + u'=' + data[lang]).encode('utf8'))
                    else:
                        missing.append(lang)
                        h.update((u'null-' + lang).encode('utf8'))
                if self.defaultLang in data:
                    ordered.append(u'#default=' + data[self.defaultLang])
                if len(missing):
                    subpage = subpage.replace(u'%missing%', u"'''Languages missing''': " + u', '.join(missing))
                else:
                    subpage = subpage.replace(u'%missing%', u"'''Supported languages''': All")
            else: # Not a translation
                h.update('Any-')
                # Sort subkeys for a stable switch order and a stable hash.
                subkeys = data.keys()
                subkeys.sort()
                for k in subkeys:
                    ordered.append(k + u'=' + data[k])
                    h.update((k + u'=' + data[k]).encode('utf8'))
                #ordered.append(u'#default=' + u(self.invalidParamError))
            subpage = subpage.replace(u'%options%', u'|'.join(ordered))
        else: # No subkeys
            data = u(data)
            subpage = self.subpageTemplateID
            h.update(u(u'ID-' + data).encode('utf8'))
            subpage = subpage.replace(u'%string%', data)
        h = u(h.hexdigest())
        if keyName in syncData and syncData[keyName] == h:
            return # Same hash
        syncData[keyName] = h # Update sync data
        subpage = subpage.replace(u'%dictionary%', currentDict)
        subpage = subpage.replace(u'%dictionaryname%', self.dictionaries[currentDict]['name'])
        subpage = subpage.replace(u'%keyname%', keyName)
        editPage(currentDict + self.subpageSeparator + keyName, subpage, summary=u'Pushed changes from [[:' + currentDict + u']] for string "' + keyName + u'".', minor=True, nocreate=False)
    def processComment(self, commentString, currentDict, definedStrings, syncData):
        """Parse one comment block of the master page, push subpages for each
        string found, and return the normalized comment text to write back.

        definedStrings (list, mutated in place) tracks keys already seen so
        duplicates are dropped from the writeback.
        """
        commentContents = []
        for extractedStr in self.stringsExtract.finditer(commentString):
            # NOTE(review): extraction likely dropped a u'' literal here —
            # 'comment = u' assigns the helper function itself; verify.
            comment = u
            if extractedStr.group(1):
                comment = u'# ' + u(extractedStr.group(1)) + u'\n'
            dataString = u(extractedStr.group(3))
            if dataString.find(u'\r') == -1 and dataString.find(u'\n') == -1: # Assume no subkeys
                data = dataString.strip()
                dataWriteback = u' ' + data
            else: # There's subkeys; detect whether this is a translation or not
                data = {}
                isTranslation = True
                for translation in self.translationExtract.finditer(dataString.strip()):
                    data[u(translation.group(1))] = u(translation.group(2))
                    if u(translation.group(1)) not in self.languages:
                        isTranslation = False
                ordered = []
                if isTranslation:
                    # Preserve the preferred language order in the writeback.
                    for lang in self.languages:
                        if lang in data:
                            ordered.append(u' ' + lang + u': ' + data[lang])
                else: # Not a translation, so order in alphabetical order
                    subkeys = data.keys()
                    subkeys.sort()
                    for subk in subkeys:
                        ordered.append(u' ' + subk + u': ' + data[subk])
                dataWriteback = u'\n' + u'\n'.join(ordered)
            # A definition line may declare several keys separated by '|'.
            keyNames = u(extractedStr.group(2)).lower().split(u'|')
            validKeyNames = []
            for keyName in keyNames:
                keyName = keyName.replace(u'_', u' ').strip()
                if keyName in definedStrings:
                    continue # Duplicate key
                definedStrings.append(keyName)
                validKeyNames.append(keyName)
                self.generateSubpage(keyName, data, currentDict, syncData)
            if len(validKeyNames):
                commentContents.append(comment + u' | '.join(validKeyNames) + u':' + dataWriteback)
        return u'\n\n'.join(commentContents)
    def __call__(self, content, **kwargs):
        """Filter entry point: run only on whitelisted dictionary pages.

        Loads the sync data page, processes every comment block of the
        master page, deletes subpages for removed keys, and rewrites the
        sync data page when any hash changed. Returns the (rewritten)
        master page content.
        """
        if 'article' not in kwargs:
            return content
        if u(kwargs['article'].title) not in self.dictionaries:
            return content
        currentDict = u(kwargs['article'].title)
        syncPage = page(self.dictionaries[currentDict]['sync'])
        try:
            syncDataText = u(syncPage.getWikiText()).split(u'\n')
        except: # Page probably doesn't exist
            # NOTE(review): likely a dropped u'' literal (empty fallback); verify.
            syncDataText = u
        # Parse 'key:hash' lines into a dict.
        syncData = {}
        for sync in syncDataText:
            sync = u(sync.strip())
            if not sync:
                continue
            sync = sync.split(u':', 2)
            if len(sync) == 2:
                syncData[sync[0]] = sync[1]
        oldSyncData = syncData.copy()
        # NOTE(review): likely a dropped u'' literal (accumulator start); verify.
        newContent = u
        previousIndex = 0
        definedStrings = []
        for comment in self.commentsExtract.finditer(content):
            newContent += content[previousIndex:comment.start()]
            previousIndex = comment.end()
            # Process current comment
            # NOTE(review): the processComment call appears to have been lost in
            # extraction — this likely wrapped self.processComment(...) back into
            # a comment block; restore before running.
            newContent += u
        newContent += content[previousIndex:]
        # Check if we need to update sync data
        needUpdate = False
        for k in syncData:
            if k not in oldSyncData or oldSyncData[k] != syncData[k]:
                needUpdate = True
                break
        # Check for deleted strings
        for k in oldSyncData:
            if k not in definedStrings:
                try:
                    deletePage(currentDict + self.subpageSeparator + k, 'Removed deleted string "' + k + u'" from ' + currentDict + u'.')
                except:
                    pass
                if k in syncData:
                    del syncData[k]
                needUpdate = True
        if needUpdate:
            # Build syncdata string representation
            syncKeys = syncData.keys()
            syncKeys.sort()
            syncLines = []
            for k in syncKeys:
                syncLines.append(k + u':' + syncData[k])
            editPage(syncPage, u'\n'.join(syncLines), summary=u'Updated synchronization information for [[:' + currentDict + u']].', minor=True, nocreate=False)
        return newContent
addFilter(DictionaryUpdater())
File filters
class imageCrushFilter:
def __init__(self):
self.minRatio = 10 # Compression ratio threshold
self.minByteDiff = 2048 # Byte difference threshold
self.jpgScanMap = u'0: 0 0 0 0 ;1 2: 0 0 0 0 ;0: 1 8 0 2 ;1: 1 8 0 0 ;2: 1 8 0 0 ;0: 9 63 0 2 ;0: 1 63 2 1 ;0: 1 63 1 0 ;1: 9 63 0 0 ;2: 9 63 0 0 ;'.replace(u';', u';\n')
self.filterName = 'Saved crush information'
self.extractHash = compileRegex(r'\{\{(?:png)?crush\s*\|\s*(\w+?)\s*\|\s*(\w+?)\s*}}')
try:
subprocess.call(['pngcrush', '-version'])
self.pngenabled = True
except:
print 'Warning: PNGCrush is not installed or not in $PATH'
self.pngenabled = False
try:
subprocess.call(['jpegtran', '-h'])
self.jpgenabled = True
except:
print 'Warning: jpegtran is not installed or not in $PATH'
self.jpgenabled = False
def getRandBits(self):
return random.getrandbits(128)
def getFileHash(self, filename):
h = hashlib.md5()
f = open(filename, 'rb')
for i in f.readlines():
h.update(i)
f.close()
return u(h.hexdigest())
def deleteFile(self, *fs):
for f in fs:
try:
os.remove(tempFile)
except:
pass
def __call__(self, content, article, **kwargs):
title = u(article.title).lower()
if title[-4:] == '.png':
isPNG = True
if not self.pngenabled:
return content
elif title[-5:] == '.jpeg' or title[-4:] == '.jpg':
isPNG = False
if not self.jpgenabled:
return content
else:
return content
try: # This is a high-risk filter, lots of I/O, so wrap it in a big try
filePage = wikitools.wikifile.File(wiki(), article.title)
hashes = [u, u]
hashResult = self.extractHash.search(content)
hashTemplate = None
if hashResult:
hashes = [u(hashResult.group(1)), u(hashResult.group(2))]
hashTemplate = u'{{crush|' + hashes[0] + u'|' + hashes[1] + u'}}'
tempFile = getTempFilename()
filePage.download(location=tempFile, urlQuery=u(self.getRandBits()))
oldHash = self.getFileHash(tempFile)
if oldHash in hashes:
return content # Already worked on that one
hashTemplate = u'{{crush|' + oldHash + u'|None}}'
tempOutput = getTempFilename()
if isPNG:
result = subprocess.call(['pngcrush', '-rem', 'gAMA', '-rem', 'cHRM', '-rem', 'iCCP', '-rem', 'sRGB', '-brute', tempFile, tempOutput])
else:
mapFile = getTempFilename()
mapFileHandle = open(mapFile, 'wb')
mapFileHandle.write(self.jpgScanMap.encode('ascii')) # Onoz ASCII
mapFileHandle.close()
result = subprocess.call(['jpegtran', '-o', '-scans', mapFile, '-copy', 'none', '-progressive', '-outfile', tempOutput, tempFile])
self.deleteFile(mapFile)
oldSize = os.path.getsize(tempFile)
newSize = os.path.getsize(tempOutput)
self.deleteFile(tempFile)
if not result and oldSize > newSize:
# Ready to upload... or are we?
ratio = int(round(100 * (1.0 - float(newSize) / float(oldSize))))
if ratio >= self.minRatio or oldSize - newSize >= self.minByteDiff:
newHash = self.getFileHash(tempOutput)
if newHash in hashes:
self.deleteFile(tempOutput)
return content # Already got that result, no need to reupload
hashTemplate = u'{{crush|' + oldHash + u'|' + newHash + u'}}'
uploadFile(tempOutput, u(article.title), u'Crushed version: ' + u(ratio) + u'% reduction / ' + u(oldSize - newSize) + u' bytes saved; from ' + u(oldSize) + u' to ' + u(newSize) + u' bytes.', overwrite=True, reupload=True)
hashes = [oldHash, newHash]
if hashResult:
content = content[:hashResult.start()] + hashTemplate + content[hashResult.end():]
else:
content = content.strip() + u'\n\n' + hashTemplate
self.deleteFile(tempOutput)
except:
pass # Well, that didn't work
return content
addFileFilter(imageCrushFilter())