User:WindBOT/Filters: Difference between revisions

Jump to navigation Jump to search
(Created page with "{{User:WindBOT/Header}} == How to disable a filter == If the bot is malfunctioning, chances are that the problem lies in one of these blocks of code. Thus, instead of shutting d...")
 
No edit summary
Line 15: Line 15:
__TOC__
__TOC__


No filters yet!
== Page filters ==
  # Hand two regular expressions over page titles to the bot's page filter.
  # NOTE(review): presumably pages whose title matches are left alone by the bot - confirm against addPageFilter.
  addPageFilter(r'^user:', r'(?:talk|help|wiki|template):')
 
== Semantic filters ==
None yet~
 
== Language-specific filters ==
None yet~
 
== Link filters ==
=== Wikipedia links filter ===
  def wikipediaLinks(link, **kwargs):
      wikipediaRegex = compileRegex(r'^https?://(?:(\w+)\.)?wikipedia\.org/wiki/(\S+)')
      if link.getType() == u'external':
          linkInfo = wikipediaRegex.search(link.getLink())
          if linkInfo:
              link.setType(u'internal')
              try:
                  wikiPage = urllib2.unquote(str(linkInfo.group(2))).decode('utf8', 'ignore').replace(u'_', ' ')
              except:
                  wikiPage = u(linkInfo.group(2)).replace(u'_', ' ')
              if not linkInfo.group(1) or linkInfo.group(1).lower() == u'en':
                  link.setLink(u'Wikipedia:' + wikiPage) # English Wikipedia
              else:
                  link.setLink(u'Wikipedia:' + linkInfo.group(1).lower() + u':' + wikiPage) # Non-english Wikipedia
              if link.getLabel() is None:
                  link.setLabel(u'(Wikipedia)')
      return link
  addLinkFilter(wikipediaLinks)
 
=== Remove trailing slashes from internal links ===
  def removeTrailingSlash(l, **kwargs):
      if l.getType() != u'internal':
          return l
      if l.getLink()[-1] == '/':
          l.setLink(l.getLink()[:-1])
      return l
  addLinkFilter(removeTrailingSlash)
 
== Template filters ==
=== Template renaming ===
  def templateRenameMapping(t, **kwargs):
      templateMap = {
          # Format goes like this (without the "#" in front obviously):
          #'Good template name': ['Bad template lowercase name 1', 'Bad template lowercase name 2', 'Bad template lowercase name 3'],
          # Last line has no comma at the end
          'Crush': ['pngcrush']
      }
      for n in templateMap:
          if t.getName().lower() in templateMap[n]:
              t.setName(n)
      return t
  addTemplateFilter(templateRenameMapping)
 
=== Remove useless templates ===
  def removeUselessTemplate(t, **kwargs):
      if t.getName().lower() in (u'targeted', u'languages'):
          return None # Delete template
      return t
  addTemplateFilter(removeUselessTemplate)
 
=== Filter parameters of certain templates ===
  def templateParamFilter(t, **kwargs):
      params = { # Map: 'lowercase template name': ['list', 'of', 'params', 'to', 'filter']
          'patch layout': ['before', 'after', 'current'],
          'item infobox': ['released']
      }
      if t.getName().lower() not in params:
          return t
      for p in params[t.getName().lower()]:
          if t.getParam(p):
              t.setParam(p, fixContent(t.getParam(p), **kwargs))
      return t
  addTemplateFilter(templateParamFilter)
 
=== Remove obsolete parameters ===
  def obsoleteParameterFilter(t, **kwargs):
      params = { # Map: 'lowercase template name': ['list', 'of', 'params', 'to', 'delete']
      }
      if t.getName().lower() not in params:
          return t
      for p in params[t.getName().lower()]:
          p = u(p)
          if p.find(u'#n') != -1:
              for i in range(10):
                  t.delParam(p.replace(u'#n', str(i)))
          else:
              t.delParam(p)
      return t
  addTemplateFilter(obsoleteParameterFilter)
 
=== Implement {{tl|Dictionary}} ===
  class DictionaryUpdater:
      """Content filter that keeps dictionary subpages in sync with their master page.
      When called on one of the pages listed in self.dictionaries, it parses the string
      definitions held inside the HTML comments of that page, pushes one subpage per key
      through editPage, deletes the subpages of keys that were removed, and records an
      MD5 per key on a sync page so that unchanged strings are not edited again.
      """
      def __init__(self):
          """Set up subpage templates, the dictionary list, languages and parsing regexes."""
          # Wikitext of generated subpages; the %name% placeholders are filled in by generateSubpage.
          # Used when every subkey of a string is a language code:
          self.subpageTemplateLang = <nowiki>"""{{#switch:{{{lang|{{SUBPAGENAME}}}}}|%options%}}<noinclude><hr style="margin: 1em 0em;" /><div style="font-size: 95%;">\n:[[File:Pictogram info.png|15px|text-top|link=]]&nbsp;'''Note''': Any changes made here will be automatically overwritten by a bot. Please ''do not'' make changes here as they will be lost. Edit '''[[:%dictionary%|the master page]]''' instead.\n:%missing%</div>[[Category:Template dictionary|%dictionaryname%/%keyname%]]</noinclude>"""</nowiki>
          # Used when at least one subkey is not a language code:
          self.subpageTemplateParam = <nowiki>"""{{#switch:{{{1|}}}|%options%}}<noinclude><hr style="margin: 1em 0em;" /><div style="font-size: 95%;">\n:[[File:Pictogram info.png|15px|text-top|link=]]&nbsp;'''Note''': Any changes made here will be automatically overwritten by a bot. Please ''do not'' make changes here as they will be lost. Edit '''[[:%dictionary%|the master page]]''' instead.</div>[[Category:Template dictionary|%dictionaryname%/%keyname%]]</noinclude>"""</nowiki>
          # Only referenced from a commented-out line in generateSubpage:
          self.invalidParamError = <nowiki>"""<div style="font-size: 95%; color: #CC0000;">\n:[[File:Pictogram info.png|15px|text-top|link=]]&nbsp;'''Error''': Invalid parameter passed.</div>"""</nowiki>
          # Used when a string has no subkeys at all:
          self.subpageTemplateID = <nowiki>"""%string%<noinclude><hr style="margin: 1em 0em;" /><div style="font-size: 95%;">\n:[[File:Pictogram info.png|15px|text-top|link=]]&nbsp;'''Note''': Any changes made here will be automatically overwritten by a bot. Please ''do not'' make changes here as they will be lost. Edit '''[[:%dictionary%|the master page]]''' instead.</div>[[Category:Template dictionary|%dictionaryname%/%keyname%]]</noinclude>"""</nowiki>
          self.dictionaries = {
              u'Template:Dictionary/items': { # Dictionary page
                  'name': 'items', # Dictionary name (used for categorizing)
                  'sync': 'Template:Dictionary/items/Special:SyncData' # Page holding last sync data
              },
              u'Template:Dictionary/common strings': { # Warning: no underscore
                  'name': 'common strings',
                  'sync': 'Template:Dictionary/common strings/Special:SyncData'
              },
              u'Template:Dictionary/price': {
                  'name': 'price',
                  'sync': 'Template:Dictionary/price/Special:SyncData'
              }
          }
          self.subpageSeparator = u'/'
          # List of supported languages, in preferred order
          self.languages = [u'en', u'ar', u'cs', u'da', u'de', u'es', u'fi', u'fr', u'hu', u'it', u'ja', u'ko', u'nl', u'no', u'pl', u'pt', u'pt-br', u'ro', u'ru', u'sv', u'zh-hans', u'zh-hant']
          self.defaultLang = u'en'
          self.filterName = u'Your friendly neighborhood dictionary updater'
          # Group 1: body of one HTML comment
          self.commentsExtract = compileRegex(r'<!--([\S\s]+?)-->')
          # Group 1: optional "#" remark line; group 2: key name(s), pipe-separated; group 3: the value,
          # either the rest of the line or the following "subkey: value" lines
          self.stringsExtract = compileRegex(r'(?:^[ \t]*#[ \t]*([^\r\n]*?)[ \t]*$\s*)?^[ \t]*([^\r\n]+?[ \t]*(?:\|[ \t]*[^\r\n]+?[ \t]*)*):[ \t]*([ \t]*[^\r\n]+?[ \t]*$|\s*[\r\n]+(?:\s*[-\w]+[ \t]*:[ \t]*[^\r\n]+[ \t]*$)+)', re.IGNORECASE | re.MULTILINE)
          # Group 1: subkey; group 2: its value
          self.translationExtract = compileRegex(r'^[ \t]*([-\w]+)[ \t]*:[ \t]*([^\r\n]+)[ \t]*$', re.IGNORECASE | re.MULTILINE)
          addWhitelistPage(self.dictionaries.keys())
      def generateSubpage(self, keyName, data, currentDict, syncData):
          """Build the subpage for one key and push it with editPage if its content hash changed.
          keyName: name of the string; also the subpage name under the dictionary page.
          data: either a dict of subkey to value, or a plain string for a key without subkeys.
          currentDict: title of the dictionary page, a key of self.dictionaries.
          syncData: dict of key name to MD5 hex digest; updated in place when the hash changes.
          Returns None in every case.
          """
          h = hashlib.md5()
          if type(data) is type({}): # Subkeys (translations or not)
              isTranslation = True
              subpage = u(self.subpageTemplateLang)
              # A single subkey that is not a language code makes the whole string a non-translation
              for k in data:
                  if k not in self.languages:
                      isTranslation = False
                      subpage = u(self.subpageTemplateParam)
                      break
              ordered = []
              if isTranslation:
                  missing = []
                  # Walk the languages in preferred order so both output and hash are deterministic
                  for lang in self.languages:
                      if lang in data:
                          ordered.append(lang + u'=' + data[lang])
                          h.update((lang + u'=' + data[lang]).encode('utf8'))
                      else:
                          missing.append(lang)
                          h.update((u'null-' + lang).encode('utf8'))
                  # The default-language value doubles as the fallback option of the switch
                  if self.defaultLang in data:
                      ordered.append(u'#default=' + data[self.defaultLang])
                  if len(missing):
                      subpage = subpage.replace(u'%missing%', <nowiki>u"'''Languages missing''': "</nowiki> + u', '.join(missing))
                  else:
                      subpage = subpage.replace(u'%missing%', <nowiki>u"'''Supported languages''': All"</nowiki>)
              else: # Not a translation
                  h.update('Any-')
                  subkeys = data.keys()
                  subkeys.sort()
                  for k in subkeys:
                      ordered.append(k + u'=' + data[k])
                      h.update((k + u'=' + data[k]).encode('utf8'))
                  #ordered.append(u'#default=' + u(self.invalidParamError))
              subpage = subpage.replace(u'%options%', u'|'.join(ordered))
          else: # No subkeys
              data = u(data)
              subpage = self.subpageTemplateID
              h.update(u(u'ID-' + data).encode('utf8'))
              subpage = subpage.replace(u'%string%', data)
          h = u(h.hexdigest())
          if keyName in syncData and syncData[keyName] == h:
              return # Same hash
          syncData[keyName] = h # Update sync data
          subpage = subpage.replace(u'%dictionary%', currentDict)
          subpage = subpage.replace(u'%dictionaryname%', self.dictionaries[currentDict]['name'])
          subpage = subpage.replace(u'%keyname%', keyName)
          editPage(currentDict + self.subpageSeparator + keyName, subpage, summary=<nowiki>u'Pushed changes from [[:' + currentDict + u']] for string "' + keyName + u'".'</nowiki>, minor=True, nocreate=False)
      def processComment(self, commentString, currentDict, definedStrings, syncData):
          """Parse the string definitions of one comment body and return them in normalized form.
          Every key that is not already in definedStrings is appended to it and handed to
          generateSubpage; keys seen before are dropped as duplicates. Subkeys are written back
          in language order for translations and in alphabetical order otherwise.
          Returns the rewritten comment body, definitions separated by an empty line.
          """
          commentContents = []
          for extractedStr in self.stringsExtract.finditer(commentString):
              comment = u''
              if extractedStr.group(1):
                  comment = u'# ' + u(extractedStr.group(1)) + u'\n'
              dataString = u(extractedStr.group(3))
              if dataString.find(u'\r') == -1 and dataString.find(u'\n') == -1: # Assume no subkeys
                  data = dataString.strip()
                  dataWriteback = u' ' + data
              else: # There's subkeys; detect whether this is a translation or not
                  data = {}
                  isTranslation = True
                  for translation in self.translationExtract.finditer(dataString.strip()):
                      data[u(translation.group(1))] = u(translation.group(2))
                      if u(translation.group(1)) not in self.languages:
                          isTranslation = False
                  ordered = []
                  if isTranslation:
                      for lang in self.languages:
                          if lang in data:
                              ordered.append(u'  ' + lang + u': ' + data[lang])
                  else: # Not a translation, so order in alphabetical order
                      subkeys = data.keys()
                      subkeys.sort()
                      for subk in subkeys:
                          ordered.append(u'  ' + subk + u': ' + data[subk])
                  dataWriteback = u'\n' + u'\n'.join(ordered)
              # Several key names may share one value; they are lowercased and underscores become spaces
              keyNames = u(extractedStr.group(2)).lower().split(u'|')
              validKeyNames = []
              for keyName in keyNames:
                  keyName = keyName.replace(u'_', u' ').strip()
                  if keyName in definedStrings:
                      continue # Duplicate key
                  definedStrings.append(keyName)
                  validKeyNames.append(keyName)
                  self.generateSubpage(keyName, data, currentDict, syncData)
              # A definition whose keys were all duplicates is left out of the rewritten comment
              if len(validKeyNames):
                  commentContents.append(comment + u' | '.join(validKeyNames) + u':' + dataWriteback)
          return u'\n\n'.join(commentContents)
      def __call__(self, content, **kwargs):
          """Filter entry point: synchronize subpages for a dictionary page and normalize its text.
          content is returned unchanged unless kwargs holds an 'article' whose title is a key of
          self.dictionaries. Otherwise every HTML comment in content is rewritten through
          processComment, subpages of keys that disappeared are deleted, the sync page is
          rewritten when anything changed, and the new page text is returned.
          """
          if 'article' not in kwargs:
              return content
          if u(kwargs['article'].title) not in self.dictionaries:
              return content
          currentDict = u(kwargs['article'].title)
          syncPage = page(self.dictionaries[currentDict]['sync'])
          try:
              syncDataText = u(syncPage.getWikiText()).split(u'\n')
          except: # Page probably doesn't exist
              syncDataText = u'' # Looping over an empty string yields nothing, so syncData stays empty
          # Sync page format: one "key:hash" entry per line
          syncData = {}
          for sync in syncDataText:
              sync = u(sync.strip())
              if not sync:
                  continue
              # NOTE(review): a maxsplit of 2 allows three parts, so a line with two colons fails the length check below and is ignored
              sync = sync.split(u':', 2)
              if len(sync) == 2:
                  syncData[sync[0]] = sync[1]
          oldSyncData = syncData.copy()
          newContent = u''
          previousIndex = 0
          definedStrings = []
          for comment in self.commentsExtract.finditer(content):
              # Text between comments is copied through untouched
              newContent += content[previousIndex:comment.start()]
              previousIndex = comment.end()
              # Process current comment
              newContent += u'<!--\n\n' + self.processComment(u(comment.group(1)).strip(), currentDict, definedStrings, syncData) + u'\n\n-->'
          newContent += content[previousIndex:]
          # Check if we need to update sync data
          needUpdate = False
          for k in syncData:
              if k not in oldSyncData or oldSyncData[k] != syncData[k]:
                  needUpdate = True
                  break
          # Check for deleted strings
          for k in oldSyncData:
              if k not in definedStrings:
                  try:
                      deletePage(currentDict + self.subpageSeparator + k, 'Removed deleted string "' + k + u'" from [[:' + currentDict + u']].')
                  except:
                      pass
                  # The key is dropped from the sync data even when the deletion above failed
                  if k in syncData:
                      del syncData[k]
                  needUpdate = True
          if needUpdate:
              # Build syncdata string representation
              syncKeys = syncData.keys()
              syncKeys.sort()
              syncLines = []
              for k in syncKeys:
                  syncLines.append(k + u':' + syncData[k])
              editPage(syncPage, u'\n'.join(syncLines), summary=<nowiki>u'Updated synchronization information for [[:' + currentDict + u']].'</nowiki>, minor=True, nocreate=False)
          return newContent
  addFilter(DictionaryUpdater())
 
== File filters ==
=== [http://en.wikipedia.org/wiki/Pngcrush PNGCrush]/[http://jpegclub.org/ jpegtran] all PNG/JPG images ===
  class imageCrushFilter:
      def __init__(self):
          self.minRatio = 10 # Compression ratio threshold
          self.minByteDiff = 2048 # Byte difference threshold
          self.jpgScanMap = u'0:  0  0 0 0 ;1 2: 0  0 0 0 ;0:  1  8 0 2 ;1:  1  8 0 0 ;2:  1  8 0 0 ;0:  9 63 0 2 ;0:  1 63 2 1 ;0:  1 63 1 0 ;1:  9 63 0 0 ;2:  9 63 0 0 ;'.replace(u';', u';\n')
          self.filterName = 'Saved crush information'
          self.extractHash = compileRegex(r'\{\{(?:png)?crush\s*\|\s*(\w+?)\s*\|\s*(\w+?)\s*}}')
          try:
              subprocess.call(['pngcrush', '-version'])
              self.pngenabled = True
          except:
              print 'Warning: PNGCrush is not installed or not in $PATH'
              self.pngenabled = False
          try:
              subprocess.call(['jpegtran', '-h'])
              self.jpgenabled = True
          except:
              print 'Warning: jpegtran is not installed or not in $PATH'
              self.jpgenabled = False
      def getRandBits(self):
          return random.getrandbits(128)
      def getFileHash(self, filename):
          h = hashlib.md5()
          f = open(filename, 'rb')
          for i in f.readlines():
              h.update(i)
          f.close()
          return u(h.hexdigest())
      def deleteFile(self, *fs):
          for f in fs:
              try:
                  os.remove(tempFile)
              except:
                  pass
      def __call__(self, content, article, **kwargs):
          title = u(article.title).lower()
          if title[-4:] == '.png':
              isPNG = True
              if not self.pngenabled:
                  return content
          elif title[-5:] == '.jpeg' or title[-4:] == '.jpg':
              isPNG = False
              if not self.jpgenabled:
                  return content
          else:
              return content
          try: # This is a high-risk filter, lots of I/O, so wrap it in a big try
              filePage = wikitools.wikifile.File(wiki(), article.title)
              hashes = [u'', u'']
              hashResult = self.extractHash.search(content)
              hashTemplate = None
              if hashResult:
                  hashes = [u(hashResult.group(1)), u(hashResult.group(2))]
                  hashTemplate = <nowiki>u'{{crush|' + hashes[0] + u'|' + hashes[1] + u'}}'</nowiki>
              tempFile = getTempFilename()
              filePage.download(location=tempFile, urlQuery=u(self.getRandBits()))
              oldHash = self.getFileHash(tempFile)
              if oldHash in hashes:
                  return content # Already worked on that one
              hashTemplate = <nowiki>u'{{crush|' + oldHash + u'|None}}'</nowiki>
              tempOutput = getTempFilename()
              if isPNG:
                  result = subprocess.call(['pngcrush', '-rem', 'gAMA', '-rem', 'cHRM', '-rem', 'iCCP', '-rem', 'sRGB', '-brute', tempFile, tempOutput])
              else:
                  mapFile = getTempFilename()
                  mapFileHandle = open(mapFile, 'wb')
                  mapFileHandle.write(self.jpgScanMap.encode('ascii')) # Onoz ASCII
                  mapFileHandle.close()
                  result = subprocess.call(['jpegtran', '-o', '-scans', mapFile, '-copy', 'none', '-progressive', '-outfile', tempOutput, tempFile])
                  self.deleteFile(mapFile)
              oldSize = os.path.getsize(tempFile)
              newSize = os.path.getsize(tempOutput)
              self.deleteFile(tempFile)
              if not result and oldSize > newSize:
                  # Ready to upload... or are we?
                  ratio = int(round(100 * (1.0 - float(newSize) / float(oldSize))))
                  if ratio >= self.minRatio or oldSize - newSize >= self.minByteDiff:
                      newHash = self.getFileHash(tempOutput)
                      if newHash in hashes:
                          self.deleteFile(tempOutput)
                          return content # Already got that result, no need to reupload
                      hashTemplate = <nowiki>u'{{crush|' + oldHash + u'|' + newHash + u'}}'</nowiki>
                      uploadFile(tempOutput, u(article.title), u'Crushed version: ' + u(ratio) + u'% reduction / ' + u(oldSize - newSize) + u' bytes saved; from ' + u(oldSize) + u' to ' + u(newSize) + u' bytes.', overwrite=True, reupload=True)
                      hashes = [oldHash, newHash]
              if hashResult:
                  content = content[:hashResult.start()] + hashTemplate + content[hashResult.end():]
              else:
                  content = content.strip() + u'\n\n' + hashTemplate
              self.deleteFile(tempOutput)
          except:
              pass # Well, that didn't work
          return content
  addFileFilter(imageCrushFilter())