Package translate :: Package storage :: Module html
[hide private]
[frames | no frames]

Source Code for Module translate.storage.html

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2004-2006 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  # 
 22   
 23  """module for parsing html files for translation""" 
 24   
 25  import re 
 26  from translate.storage import base 
 27  from HTMLParser import HTMLParser 
 28   
class htmlunit(base.TranslationUnit):
    """A unit of translatable/localisable HTML content.

    The text is kept in ``self.text`` in an escaped form ("&" stored as
    "&amp;" and "<" stored as "&lt;").  The ``source`` property converts
    between that stored form and the plain text seen by callers.
    """

    def __init__(self, source=None):
        """Create a unit holding ``source`` with an empty list of locations.

        ``source`` may be omitted or None, in which case the unit starts
        with empty text.
        """
        self.locations = []
        self.setsource(source)

    def getsource(self):
        """Return the unescaped text with every line break turned into a space."""
        #TODO: Rethink how clever we should try to be with html entities.
        # Undo the escaping in the reverse order of setsource(): "&lt;" first
        # and "&amp;" last.  Unescaping "&amp;" first would turn a stored
        # "&amp;lt;" (i.e. a source that literally contained "&lt;") into "<".
        text = self.text.replace("&lt;", "<").replace("&amp;", "&")
        return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

    def setsource(self, source):
        """Store ``source`` with "&" and "<" escaped; None is stored as ""."""
        if source is None:
            # The constructor's default is None, which has no replace() method.
            source = ""
        # "&" must be escaped before "<" so that the "&" introduced by "&lt;"
        # is not escaped a second time.
        self.text = source.replace("&", "&amp;").replace("<", "&lt;")

    source = property(getsource, setsource)

    def addlocation(self, location):
        """Append ``location`` to this unit's list of locations."""
        self.locations.append(location)

    def getlocations(self):
        """Return the list of locations recorded for this unit."""
        return self.locations
48 49
class htmlfile(HTMLParser, base.TranslationStore):
    """Parse HTML and collect its translatable blocks as units.

    The HTMLParser callbacks accumulate the markup found inside a "marking"
    tag into ``self.currentblock``; when the block ends its text is handed to
    ``addhtmlblock``, which turns it into a unit if it has translatable
    content.
    """
    UnitClass = htmlunit
    # Tags that open a new translatable block.
    markingtags = [
        "p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td",
        "div", "li", "dt", "dd", "address", "caption",
    ]
    # Attributes whose presence on a start tag also opens a new block.
    markingattrs = []
    # Attributes whose values are extracted as blocks of their own.
    includeattrs = ["alt", "summary", "standby", "abbr", "content"]

    def __init__(self, includeuntaggeddata=None, inputfile=None):
        """Set up the parser state and, if given, parse ``inputfile``.

        includeuntaggeddata -- when true, text found outside any marking tag
                               is collected as well.
        inputfile           -- optional file-like object; it is read in full,
                               closed, and its contents parsed.  Its ``name``
                               attribute (if any) is used in unit locations.
        """
        self.units = []
        self.filename = getattr(inputfile, 'name', None)
        self.currentblock = ""
        self.currentblocknum = 0
        self.currenttag = None
        self.includeuntaggeddata = includeuntaggeddata
        HTMLParser.__init__(self)

        if inputfile is not None:
            # Close the file even when read() raises, so the handle is not
            # leaked on a failed read.
            try:
                htmlsrc = inputfile.read()
            finally:
                inputfile.close()
            self.parse(htmlsrc)

    def guess_encoding(self, htmlsrc):
        """Returns the encoding of the html text.

        We look for 'charset=' within a meta tag to do this.  The first match
        is returned, or None when no match is found.
        """

        pattern = '''(?i)<meta.*content.*=.*charset.*=\\s*([^\\s]*)\\s*["']'''
        result = re.findall(pattern, htmlsrc)
        if result:
            return result[0]
        return None

    def do_encoding(self, htmlsrc):
        """Return the html text properly encoded based on a charset.

        Text that is already unicode is returned unchanged.  Byte strings are
        decoded with the charset found by guess_encoding(), or returned as
        they are when no charset is found.
        """
        if isinstance(htmlsrc, type(u"")):
            # Calling decode() on unicode text makes Python 2 encode it to
            # ASCII first, which fails as soon as it holds non-ASCII data.
            return htmlsrc
        charset = self.guess_encoding(htmlsrc)
        if charset:
            return htmlsrc.decode(charset)
        return htmlsrc

    def parse(self, htmlsrc):
        """Decode ``htmlsrc`` if a charset is declared and feed it to the parser."""
        htmlsrc = self.do_encoding(htmlsrc)
        self.feed(htmlsrc)

    def addhtmlblock(self, text):
        """Add ``text`` as a new unit if it holds translatable content.

        The text is first reduced with strip_html().  Each unit added gets a
        location of the form "filename:blocknumber".
        """
        text = self.strip_html(text)
        if self.has_translatable_content(text):
            self.currentblocknum += 1
            # addsourceunit() is not defined in this class; presumably it is
            # inherited from base.TranslationStore -- verify against base.
            unit = self.addsourceunit(text)
            unit.addlocation("%s:%d" % (self.filename, self.currentblocknum))

    def strip_html(self, text):
        """Strip unnecessary html from the text.

        HTML tags are deemed unnecessary if it fully encloses the translatable
        text, eg. '<a href="index.html">Home Page</a>'.

        HTML tags that occurs within the normal flow of text will not be removed,
        eg. 'This is a link to the <a href="index.html">Home Page</a>.'
        """
        text = text.strip()

        # NOTE(review): the pattern only checks that the text starts with a
        # tag and ends with a closing tag; it does not check that the two
        # belong together, so '<a>x</a> and <b>y</b>' is stripped as well.
        pattern = '(?s)^<[^>]*>(.*)</.*>$'
        result = re.findall(pattern, text)
        if len(result) == 1:
            # Recurse to peel off further layers of enclosing tags.
            text = self.strip_html(result[0])
        return text

    def has_translatable_content(self, text):
        """Check if the supplied HTML snippet has any content that needs to be translated."""

        text = text.strip()
        # Text with exactly one 'charset...=' match (e.g. the content of a
        # charset meta tag) is treated as untranslatable.
        result = re.findall('(?i).*(charset.*=.*)', text)
        if len(result) == 1:
            return False

        # TODO: Get a better way to find untranslatable entities.
        if text == '&nbsp;':
            return False

        # Translatable only if something is left once all tags are removed.
        pattern = '<[^>]*>'
        remainder = re.sub(pattern, '', text).strip()
        return bool(remainder)

    # Block bookkeeping helpers, followed by the HTMLParser callbacks.

    def startblock(self, tag):
        """Flush the block collected so far and start a new one for ``tag``."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currenttag = tag

    def endblock(self):
        """Flush the block collected so far and leave block-collecting mode."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currenttag = None

    def handle_starttag(self, tag, attrs):
        """Start a new block for marking tags/attributes, else keep the tag text."""
        newblock = tag in self.markingtags
        for attrname, attrvalue in attrs:
            if attrname in self.markingattrs:
                newblock = True
            # HTMLParser reports an attribute without a value (e.g.
            # '<img alt>') with a None value, which has nothing to extract.
            if attrname in self.includeattrs and attrvalue is not None:
                self.addhtmlblock(attrvalue)

        if newblock:
            self.startblock(tag)
        elif self.currenttag is not None:
            # Inline markup inside a block is kept verbatim.
            self.currentblock += self.get_starttag_text()

    def handle_startendtag(self, tag, attrs):
        """Extract included attribute values and keep the tag text in a block."""
        for attrname, attrvalue in attrs:
            # Valueless attributes carry None; see handle_starttag().
            if attrname in self.includeattrs and attrvalue is not None:
                self.addhtmlblock(attrvalue)
        if self.currenttag is not None:
            self.currentblock += self.get_starttag_text()

    def handle_endtag(self, tag):
        """End the current block on its closing tag, else keep the closing tag."""
        if tag == self.currenttag:
            self.endblock()
        elif self.currenttag is not None:
            self.currentblock += '</%s>' % tag

    def handle_data(self, data):
        """Append text to the current block, or collect untagged text if enabled."""
        if self.currenttag is not None:
            self.currentblock += data
        elif self.includeuntaggeddata:
            # startblock(None) flushes what was collected and leaves
            # currenttag as None, so the next untagged data lands here again.
            self.startblock(None)
            self.currentblock += data

    def handle_charref(self, name):
        """Pass a numeric character reference through as literal text."""
        self.handle_data("&#%s;" % name)

    def handle_entityref(self, name):
        """Pass a named entity reference through as literal text."""
        self.handle_data("&%s;" % name)

    def handle_comment(self, data):
        """Ignore HTML comments."""
        # we don't do anything with comments
        pass
194
class POHTMLParser(htmlfile):
    """Subclass of htmlfile that adds no behaviour of its own.

    NOTE(review): presumably kept as an alternative name for htmlfile --
    confirm against callers before removing.
    """
197