"""
csv.py - read/write/investigate CSV files
"""

import re
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
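
# Illustrative use of the basic reader/writer interface (the file names
# "some.csv" and "out.csv" are hypothetical; files should be opened in
# binary mode so the module controls line endings):
#
#     import csv
#     for row in csv.reader(open("some.csv", "rb")):
#         print row
#     csv.writer(open("out.csv", "wb")).writerow(["spam", 1, "eggs"])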

class Dialect:
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True
        errors = self._validate()
        if errors != []:
            raise Error, "Dialect did not validate: %s" % ", ".join(errors)

    def _validate(self):
        errors = []
        if not self._valid:
            errors.append("can't directly instantiate Dialect class")

        if self.delimiter is None:
            errors.append("delimiter character not set")
        elif (not isinstance(self.delimiter, str) or
              len(self.delimiter) > 1):
            errors.append("delimiter must be one-character string")

        if self.quotechar is None:
            if self.quoting != QUOTE_NONE:
                errors.append("quotechar not set")
        elif (not isinstance(self.quotechar, str) or
              len(self.quotechar) > 1):
            errors.append("quotechar must be one-character string")

        if self.lineterminator is None:
            errors.append("lineterminator not set")
        elif not isinstance(self.lineterminator, str):
            errors.append("lineterminator must be a string")

        if self.doublequote not in (True, False):
            errors.append("doublequote parameter must be True or False")

        if self.skipinitialspace not in (True, False):
            errors.append("skipinitialspace parameter must be True or False")

        if self.quoting is None:
            errors.append("quoting parameter not set")

        if self.quoting is QUOTE_NONE:
            if (not isinstance(self.escapechar, (unicode, str)) or
                len(self.escapechar) > 1):
                errors.append("escapechar must be a one-character string or unicode object")

        return errors

class excel(Dialect):
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)

class excel_tab(excel):
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
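
# A new dialect can be defined and registered the same way (illustrative
# only; the "unix-pipe" name and its settings are hypothetical):
#
#     class unix_pipe(Dialect):
#         delimiter = '|'
#         quotechar = '"'
#         doublequote = True
#         skipinitialspace = False
#         lineterminator = '\n'
#         quoting = QUOTE_MINIMAL
#     register_dialect("unix-pipe", unix_pipe)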


class DictReader:
    def __init__(self, f, fieldnames, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)

    def __iter__(self):
        return self

    def next(self):
        row = self.reader.next()
        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = self.reader.next()
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d
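
    # Illustrative usage ("people.csv" and the field names are hypothetical):
    #
    #     rdr = DictReader(open("people.csv", "rb"), ["name", "age"])
    #     for row in rdr:
    #         print row["name"], row["age"]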


class DictWriter:
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError, \
                  ("extrasaction (%s) must be 'raise' or 'ignore'" %
                   extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        if self.extrasaction == "raise":
            for k in rowdict.keys():
                if k not in self.fieldnames:
                    raise ValueError, "dict contains fields not in fieldnames"
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)

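# Illustrative usage ("people.csv" and the field names are hypothetical;
# missing keys are filled with restval, here the empty string):
#
#     wtr = DictWriter(open("people.csv", "wb"), ["name", "age"])
#     wtr.writerow({"name": "Alice", "age": 30})
#     wtr.writerows([{"name": "Bob"}, {"name": "Eve", "age": 7}])
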
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar).
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        quotechar, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect
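
    # Illustrative use of sniff() (the sample text is hypothetical):
    #
    #     sniffer = Sniffer()
    #     dialect = sniffer.sniff("a,b,c\r\n1,2,3\r\n")
    #     rows = list(reader(StringIO("a,b,c\r\n1,2,3\r\n"), dialect))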


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote character that appears most often wins; likewise for
        the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.S | re.M)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all-or-nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try to evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """
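        # Worked illustration of step 2 (hypothetical sample): if ',' occurs
        # twice on 9 of 10 lines and three times on the remaining line, its
        # meta-frequency table is {2: 9, 3: 1}; the mode (2, 9) is then
        # adjusted by the other entries to (2, 8), i.e. ',' hits its expected
        # frequency with consistency 8/10 -- just short of the 0.9 threshold,
        # so another chunk of rows would be examined.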

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.strip().count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines, unless we're done
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, so pick the first candidate
        delim = delims.keys()[0]
        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
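
# Illustrative use of the Sniffer heuristics ("some.csv" is a hypothetical
# file; reading a fixed-size prefix keeps sniffing cheap):
#
#     sample = open("some.csv", "rb").read(1024)
#     sniffer = Sniffer()
#     dialect = sniffer.sniff(sample)
#     if sniffer.has_header(sample):
#         print "first row looks like a header"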