Package translate :: Package tools :: Module poterminology
[hide private]
[frames] | [no frames]

Source Code for Module translate.tools.poterminology

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # This file is part of translate. 
  5  # 
  6  # translate is free software; you can redistribute it and/or modify 
  7  # it under the terms of the GNU General Public License as published by 
  8  # the Free Software Foundation; either version 2 of the License, or 
  9  # (at your option) any later version. 
 10  #  
 11  # translate is distributed in the hope that it will be useful, 
 12  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 14  # GNU General Public License for more details. 
 15  # 
 16  # You should have received a copy of the GNU General Public License 
 17  # along with translate; if not, write to the Free Software 
 18  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 19   
 20  """reads a set of .po or .pot files to produce a pootle-terminology.pot""" 
 21   
 22  from translate.storage import factory 
 23  from translate.lang import factory as lang_factory 
 24  from translate.storage import po 
 25  from translate.misc import optrecurse 
 26  import sys 
 27  import os 
 28  import re 
 29   
class TerminologyOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the terminology tool, which mines a
    set of PO files for recurring terms and phrases"""

    # handles c-format and python-format: matches printf-style placeholders
    # such as %s, %5.2f, %(name)d and %1$s so they can be blanked out
    formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
    # handles XML/HTML: matches element/closing/PI tags plus "<!-" comment
    # and "<![" marked-section openings
    xmlpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")

    # recognized values for --sort; also used as the default order list
    sortorders = [ "frequency", "dictionary", "length" ]

    # running counters, incremented per input file / per translation unit
    # while processing (reported by outputterminology)
    files = 0
    units = 0
43 - def parse_args(self, args=None, values=None):
44 """parses the command line options, handling implicit input/output args""" 45 (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values) 46 # some intelligence as to what reasonable people might give on the command line 47 if args and not options.input: 48 if not options.output and len(args) > 1: 49 options.input = args[:-1] 50 args = args[-1:] 51 else: 52 options.input = args 53 args = [] 54 if args and not options.output: 55 options.output = args[-1] 56 args = args[:-1] 57 if not options.output: 58 options.output = "pootle-terminology.pot" 59 if args: 60 self.error("You have used an invalid combination of --input, --output and freestanding args") 61 if isinstance(options.input, list) and len(options.input) == 1: 62 options.input = options.input[0] 63 return (options, args)
64
65 - def set_usage(self, usage=None):
66 """sets the usage string - if usage not given, uses getusagestring for each option""" 67 if usage is None: 68 self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \ 69 "\n input directory is searched for PO files, terminology PO file is output file" 70 else: 71 super(TerminologyOptionParser, self).set_usage(usage)
72
73 - def run(self):
74 """parses the arguments, and runs recursiveprocess with the resulting options""" 75 (options, args) = self.parse_args() 76 options.inputformats = self.inputformats 77 options.outputoptions = self.outputoptions 78 self.usepsyco(options) 79 self.recursiveprocess(options)
80
81 - def recursiveprocess(self, options):
82 """recurse through directories and process files""" 83 if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True): 84 if isinstance(options.input, list): 85 inputfiles = self.recurseinputfilelist(options) 86 else: 87 inputfiles = self.recurseinputfiles(options) 88 else: 89 if options.input: 90 inputfiles = [os.path.basename(options.input)] 91 options.input = os.path.dirname(options.input) 92 else: 93 inputfiles = [options.input] 94 if os.path.isdir(options.output): 95 options.output = os.path.join(options.output,"pootle-terminology.pot") 96 self.stopwords = {} 97 self.stoprelist = [] 98 actions = { '+': frozenset(), ':': frozenset(['skip']), 99 '<': frozenset(['phrase']), '=': frozenset(['word']), 100 '>': frozenset(['word','skip']), 101 '@': frozenset(['word','phrase']) } 102 if options.stopwordfile != None: 103 stopfile = open(options.stopwordfile, "r") 104 try: 105 for stopline in stopfile: 106 stoptype = stopline[0] 107 if stoptype == '#' or stoptype == "\n": 108 continue 109 elif stoptype == '/': 110 self.stoprelist.append(re.compile(stopline[1:-1]+'$')) 111 else: 112 self.stopwords[stopline[1:-1]] = actions[stoptype] 113 except KeyError, character: 114 self.warning("Bad line in stopword list %s starts with" % (options.stopwordfile), options, sys.exc_info()) 115 stopfile.close() 116 self.glossary = {} 117 self.initprogressbar(inputfiles, options) 118 for inputpath in inputfiles: 119 self.files += 1 120 fullinputpath = self.getfullinputpath(options, inputpath) 121 try: 122 success = self.processfile(None, options, fullinputpath) 123 except Exception, error: 124 if isinstance(error, KeyboardInterrupt): 125 raise 126 self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info()) 127 success = False 128 self.reportprogress(inputpath, success) 129 del self.progressbar 130 self.outputterminology(options)
131
132 - def clean(self, string, options):
133 """returns the cleaned string that contains the text to be matched""" 134 for accelerator in options.accelchars: 135 string = string.replace(accelerator, "") 136 string = self.formatpat.sub(" ", string) 137 string = self.xmlpat.sub(" ", string) 138 string = string.strip() 139 return string
140
141 - def addphrases(self, words, skips, translation, partials=True):
142 """adds (sub)phrases with non-skipwords and more than one word""" 143 if (len(words) > skips + 1 and 144 'skip' not in self.stopwords.get(words[0], frozenset()) and 145 'skip' not in self.stopwords.get(words[-1], frozenset())): 146 self.glossary.setdefault(' '.join(words), []).append(translation) 147 if partials: 148 part = list(words) 149 while len(part) > 2: 150 if 'skip' in self.stopwords.get(part.pop(), frozenset()): 151 skips -= 1 152 if (len(part) > skips + 1 and 153 'skip' not in self.stopwords.get(part[0], frozenset()) and 154 'skip' not in self.stopwords.get(part[-1], frozenset())): 155 self.glossary.setdefault(' '.join(part), []).append(translation)
156 #XXX print ' '.join(part) + "\n" 157
    def processfile(self, fileprocessor, options, fullinputpath):
        """process an individual file

        Mines every translated, non-plural unit for terminology: single
        words (with naive English plural folding) and multi-word phrases
        of up to options.termlength words, honouring the stopword actions
        ('word', 'phrase', 'skip') loaded by recursiveprocess().
        fileprocessor is unused here; recursiveprocess passes None.
        """
        inputfile = self.openinputfile(options, fullinputpath)
        inputfile = factory.getobject(inputfile)
        sourcelang = lang_factory.getlanguage(options.sourcelanguage)
        # words matched by a stopword regex suppress both word and phrase
        # extraction (but are not 'skip' words)
        rematchignore = frozenset(('word', 'phrase'))
        defaultignore = frozenset()
        for unit in inputfile.units:
            self.units += 1
            if unit.isheader() or not unit.istranslated():
                continue
            if unit.hasplural():
                continue
            if not options.invert:
                source = self.clean(unit.source, options)
                target = self.clean(unit.target, options)
            else:
                # --invert: mine the translations as if they were the source
                target = self.clean(unit.source, options)
                source = self.clean(unit.target, options)
            # nothing useful in an empty or one-character source
            if len(source) <= 1:
                continue
            for sentence in sourcelang.sentences(source):
                # sliding window of candidate phrase words, plus how many of
                # them are skip-words
                words = []
                skips = 0
                for word in sourcelang.words(sentence):
                    if options.ignorecase or (options.foldtitle and word.istitle()):
                        word = word.lower()
                    ignore = defaultignore
                    if word in self.stopwords:
                        ignore = self.stopwords[word]
                    else:
                        for stopre in self.stoprelist:
                            if stopre.match(word) != None:
                                ignore = rematchignore
                                break
                    translation = (source, target, unit, fullinputpath)
                    if 'word' not in ignore:
                        # reduce plurals: fold "words" onto an existing "word"
                        # entry, or rename an existing plural entry to its
                        # singular when the singular appears
                        root = word
                        if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary:
                            root = word[0:-1]
                        elif len(root) > 2 and root + 's' in self.glossary:
                            self.glossary[root] = self.glossary.pop(root + 's')
                        self.glossary.setdefault(root, []).append(translation)
                    if 'phrase' in ignore:
                        # this word breaks phrases: add trailing phrases in
                        # previous words, then restart the window
                        while len(words) > 2:
                            if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                                skips -= 1
                            self.addphrases(words, skips, translation)
                        words = []
                        skips = 0
                    else:
                        words.append(word)
                        if 'skip' in ignore:
                            skips += 1
                        if len(words) > options.termlength + skips:
                            # window overflow: slide it forward, then emit
                            # the phrase and its sub-phrases
                            while len(words) > options.termlength + skips:
                                if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                                    skips -= 1
                            self.addphrases(words, skips, translation)
                        else:
                            self.addphrases(words, skips, translation, partials=False)
                # add trailing phrases in sentence after reaching end
                while len(words) > 2:
                    if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                        skips -= 1
                    self.addphrases(words, skips, translation)
    def outputterminology(self, options):
        """saves the generated terminology glossary

        Thresholds the collected glossary (minimum input files, source
        locations and distinct messages), merges each surviving term's
        units into one PO unit, removes sub-phrases that occur no more
        often than a longer phrase containing them, sorts by the
        requested order(s) and writes the result to options.output.
        """
        termfile = po.pofile()
        terms = {}
        # strips a trailing ":linenumber" from a location
        locre = re.compile(r":[0-9]+$")
        print "%d terms from %d units in %d files" % (len(self.glossary), self.units, self.files)
        for term, translations in self.glossary.iteritems():
            # a term seen only once is never kept
            if len(translations) <= 1:
                continue
            filecounts = {}
            sources = {}
            termunit = po.pounit(term)
            locations = {}
            sourcenotes = {}
            transnotes = {}
            targets = {}
            # True once the term equals a whole cleaned message
            fullmsg = False
            for source, target, unit, filename in translations:
                sources[source] = 1
                filecounts[filename] = filecounts.setdefault(filename, 0) + 1
                if term.lower() == self.clean(unit.source, options).lower():
                    fullmsg = True
                    target = self.clean(unit.target, options)
                    if options.ignorecase or (options.foldtitle and target.istitle()):
                        target = target.lower()
                    unit.settarget(target)
                    if target != "":
                        targets.setdefault(target, []).append(filename)
                    # only keep notes when the raw source matches too
                    if term.lower() == unit.source.strip().lower():
                        sourcenotes[unit.getnotes("source code")] = None;
                        transnotes[unit.getnotes("translator")] = None;
                else:
                    # substring match only: contribute no translation
                    unit.settarget("")
                unit.setsource(term)
                termunit.merge(unit, overwrite=False, comments=False)
                for loc in unit.getlocations():
                    # dict used as a set of line-number-less locations
                    locations.setdefault(locre.sub("", loc))
            numsources = len(sources)
            numfiles = len(filecounts)
            numlocs = len(locations)
            if numfiles < options.inputmin or numlocs < options.locmin:
                continue
            if fullmsg:
                if numsources < options.fullmsgmin:
                    continue
            elif numsources < options.substrmin:
                continue
            if len(targets.keys()) > 1:
                # several competing translations: annotate each with the
                # files it came from and mark the unit fuzzy
                txt = '; '.join(["%s {%s}" % (target, ', '.join(files))
                                 for target, files in targets.iteritems()])
                if termunit.gettarget().find('};') < 0:
                    termunit.settarget(txt)
                    termunit.markfuzzy()
                else:
                    # if annotated multiple terms already present, keep as-is
                    termunit.addnote(txt, "translator")
            for location in locations.keys():
                termunit.addlocation(location)
            for sourcenote in sourcenotes.keys():
                termunit.addnote(sourcenote, "source code")
            for transnote in transnotes.keys():
                termunit.addnote(transnote, "translator")
            for file, count in filecounts.iteritems():
                termunit.othercomments.append("# (poterminology) %s (%d)\n" % (file, count))
            # score: file spread dominates, distinct sources break ties
            terms[term] = (((10 * numfiles) + numsources, termunit))
        # reduce subphrase: drop any shorter phrase whose score equals that
        # of a longer phrase containing it at either end
        termlist = terms.keys()
        print "%d terms after thresholding" % len(termlist)
        termlist.sort(lambda x, y: cmp(len(x), len(y)))
        for term in termlist:
            words = term.split()
            if len(words) <= 2:
                continue
            while len(words) > 2:
                words.pop()
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
            words = term.split()
            while len(words) > 2:
                words.pop(0)
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
        print "%d terms after subphrase reduction" % len(terms.keys())
        termitems = terms.values()
        if options.sortorders == None:
            options.sortorders = self.sortorders
        # orders are popped from the end, so the first-listed order is
        # applied last and (sorts being stable) takes priority
        while len(options.sortorders) > 0:
            order = options.sortorders.pop()
            if order == "frequency":
                termitems.sort(lambda x, y: cmp(y[0], x[0]))
            elif order == "dictionary":
                termitems.sort(lambda x, y: cmp(x[1].source.lower(), y[1].source.lower()))
            elif order == "length":
                termitems.sort(lambda x, y: cmp(len(x[1].source), len(y[1].source)))
            else:
                self.warning("unknown sort order %s" % order, options)
        for count, unit in termitems:
            termfile.units.append(unit)
        open(options.output, "w").write(str(termfile))
def main():
    """Build the terminology option parser, register all command-line
    options and run it."""
    poformats = {"po": ("po", None), None: ("po", None)}
    optparser = TerminologyOptionParser(poformats)
    optparser.add_option("-I", "--ignore-case", dest="ignorecase",
        action="store_true", default=False, help="make all terms lowercase")
    optparser.add_option("-F", "--fold-titlecase", dest="foldtitle",
        action="store_true", default=False, help='fold "Title Case" to lowercase')
    optparser.add_option("", "--accelerator", dest="accelchars", default="",
        metavar="ACCELERATORS", help="ignores the given accelerator characters when matching")
    optparser.add_option("-t", "--term-words", type="int", dest="termlength", default="3",
        help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
    optparser.add_option("", "--inputs-needed", type="int", dest="inputmin", default="2",
        help="omit terms appearing in less than MIN input files (default 2)", metavar="MIN")
    optparser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default="1",
        help="omit full message terms appearing in less than MIN different messages (default 1)", metavar="MIN")
    optparser.add_option("", "--substr-needed", type="int", dest="substrmin", default="2",
        help="omit substring-only terms appearing in less than MIN different messages (default 2)", metavar="MIN")
    optparser.add_option("", "--locs-needed", type="int", dest="locmin", default="2",
        help="omit terms appearing in less than MIN different original source files (default 2)", metavar="MIN")
    optparser.add_option("", "--sort", dest="sortorders", action="append",
        type="choice", choices=optparser.sortorders, metavar="ORDER",
        help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(optparser.sortorders))
    optparser.add_option("-S", "--stopword-list", type="string", dest="stopwordfile",
        help="name of file containing stopword list", metavar="FILENAME")
    optparser.add_option("", "--source-language", dest="sourcelanguage", default="en",
        help="the source language code (default 'en')", metavar="LANG")
    optparser.add_option("-v", "--invert", dest="invert",
        action="store_true", default=False, help="invert the source and target languages for terminology")
    optparser.set_usage()
    optparser.description = __doc__
    optparser.run()
# run the tool only when executed as a script, not when imported
if __name__ == '__main__':
    main()