
Source Code for Module translate.misc.csv

# -*- coding: utf-8 -*-

"""
csv.py - read/write/investigate CSV files
"""

import re
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
class Dialect:
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True
        errors = self._validate()
        if errors != []:
            raise Error, "Dialect did not validate: %s" % ", ".join(errors)

    def _validate(self):
        errors = []
        if not self._valid:
            errors.append("can't directly instantiate Dialect class")

        if self.delimiter is None:
            errors.append("delimiter character not set")
        elif (not isinstance(self.delimiter, str) or
              len(self.delimiter) > 1):
            errors.append("delimiter must be one-character string")

        if self.quotechar is None:
            if self.quoting != QUOTE_NONE:
                errors.append("quotechar not set")
        elif (not isinstance(self.quotechar, str) or
              len(self.quotechar) > 1):
            errors.append("quotechar must be one-character string")

        if self.lineterminator is None:
            errors.append("lineterminator not set")
        elif not isinstance(self.lineterminator, str):
            errors.append("lineterminator must be a string")

        if self.doublequote not in (True, False):
            errors.append("doublequote parameter must be True or False")

        if self.skipinitialspace not in (True, False):
            errors.append("skipinitialspace parameter must be True or False")

        if self.quoting is None:
            errors.append("quoting parameter not set")

        if self.quoting is QUOTE_NONE:
            if (not isinstance(self.escapechar, (unicode, str)) or
                len(self.escapechar) > 1):
                errors.append("escapechar must be a one-character string or unicode object")

        return errors

class excel(Dialect):
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)

class excel_tab(excel):
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)

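The excel and excel_tab classes above double as the template for user-defined dialects: subclass an existing dialect, override the formatting attributes, and register the class under a name. The sketch below just follows that pattern; the semicolon dialect and the name "excel-semicolon" are invented for illustration and are not registered by this module.

from translate.misc import csv

class excel_semicolon(csv.excel):
    # identical to "excel" except for the field separator
    delimiter = ';'

csv.register_dialect("excel-semicolon", excel_semicolon)

# the registered name can now be used wherever a dialect is accepted, e.g.
# csv.reader(open("data.csv", "rb"), "excel-semicolon")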
class DictReader:
    def __init__(self, f, fieldnames, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)

    def __iter__(self):
        return self

    def next(self):
        row = self.reader.next()
        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = self.reader.next()
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

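The next() method above zips each row against the caller-supplied fieldnames, with restkey catching overlong rows and restval padding short ones. A minimal usage sketch follows; the file name and field list are assumptions made for illustration.

from translate.misc import csv

f = open("phrases.csv", "rb")
rdr = csv.DictReader(f, ["location", "source", "target"], restval="")
for row in rdr:                # iteration goes through DictReader.next()
    print(row["source"])       # fields missing from short rows hold restval ("")
f.close()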
class DictWriter:
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError, \
                  ("extrasaction (%s) must be 'raise' or 'ignore'" %
                   extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        if self.extrasaction == "raise":
            for k in rowdict.keys():
                if k not in self.fieldnames:
                    raise ValueError, "dict contains fields not in fieldnames"
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)
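_dict_to_list() is the mirror image of DictReader.next(): it projects each dict onto the fieldnames order, filling missing keys with restval and, unless extrasaction is "ignore", raising ValueError for unexpected ones. An illustrative sketch with invented data and an invented output file name:

from translate.misc import csv

f = open("out.csv", "wb")
wrt = csv.DictWriter(f, ["source", "target"], restval="", extrasaction="ignore")
wrt.writerow({"source": "Hello", "target": "Bonjour"})
wrt.writerows([{"source": "Yes"},                                    # missing key -> restval
               {"source": "No", "target": "Non", "note": "extra"}])  # extra key ignored
f.close()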

# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']

    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        quotechar, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if delimiter is None:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect

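Because sniff() returns a Dialect subclass, its result can be passed straight to reader(). The sketch below is illustrative only, with a small invented sample; the values noted in the comments are what the heuristics implemented below are expected to produce for it.

from translate.misc import csv

sample = '"name";"language"\r\n"aardvark";"Afrikaans"\r\n'
dialect = csv.Sniffer().sniff(sample)
print(dialect.delimiter)    # ';' is expected for this sample
print(dialect.quotechar)    # '"' is expected for this sample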
    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.S | re.M)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)

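The docstring's ,'some text', example can be run against the first of the patterns above directly. The standalone snippet below copies that pattern into a raw triple-quoted string (only the quote escaping changes, which is equivalent) and shows the candidate quotechar and delimiter it captures.

import re

pat = r"""(?P<delim>[^\w\n"'])(?P<space> ?)(?P<quote>["']).*?(?P=quote)(?P=delim)"""
m = re.search(pat, ",'some text',", re.S | re.M)
if m:
    print(m.group('quote'))   # "'"  -> candidate quotechar
    print(m.group('delim'))   # ","  -> candidate delimiter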
    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.strip().count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # finally, just return the first damn character in the list
        delim = delims.keys()[0]
        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)

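Steps 1-3 of the docstring can be illustrated in isolation: count a candidate character on every line, tally those counts, and take the mode as the expected per-row frequency. The standalone sketch below uses invented sample lines and deliberately skips the chunking and consistency loop that the method adds on top.

sample_lines = ["a,b,c", "d,e,f", "g,h", "i,j,k"]
char = ','

# 1) frequency of the character on every line
freqs = [line.count(char) for line in sample_lines]      # [2, 2, 1, 2]

# 2) meta-frequency: how often each per-line count occurs
meta = {}
for count in freqs:
    meta[count] = meta.get(count, 0) + 1                 # {2: 3, 1: 1}

# 3) the mode of the meta-frequency is the expected per-row count
expected = max(meta.items(), key=lambda item: item[1])
print(expected)                # (2, 3): ',' appears twice on 3 of the 4 rows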
    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
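has_header() only needs a text sample (it sniffs the dialect itself), so callers typically pass it the first few kilobytes of a file. The sketch below is illustrative and uses a tiny inline sample instead; the expected outcome is noted in the comment.

from translate.misc import csv

sample = "word,count\r\nhello,3\r\nworld,7\r\n"
print(csv.Sniffer().has_header(sample))   # True expected: column 2 is numeric in every row but the first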