Monday, February 02, 2009

Parsing comma separated files (csv) in C#

Parsing a comma-separated file sounded like a simple task until I received a couple of bug reports.

Initially I attempted a simple approach, as shown below.

        // Naive approach: treat the first physical line as the header and split it on commas.
        using (StreamReader headerReader = new StreamReader(filePath, Encoding.Default))
        {
            // The first line must contain the column names.
            string headerLine = headerReader.ReadLine();

            // Every comma is taken as a separator, including commas inside quoted values.
            string[] columnNames = headerLine.Split(',');

            // Put the names at the front of the column list.
            csvColumns.InsertRange(0, columnNames);
        }

Because of these bugs I educated myself about the rules of CSV. Here are a couple of references.

The Wikipedia article on comma-separated values, and the CSV standard.

The important point to note here is that a value in a CSV file can itself contain embedded commas, newlines, or quotes. Such a value is enclosed in double quotes, and a quote inside it is written as two quotes.

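This simple point changes the rules of the game considerably. The normal stream operations for reading a line are of no use here, because a line break may be part of a value. Split functions are of no use either, because a comma may be part of a value. So the only option left is to parse byte by byte, watching for the characters to skip and the characters to keep.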

Here is the code with which I managed to parse a CSV file and resolve the bugs. It is quite primitive, but it works.

namespace CsvUtilities
{
using System;
using System.Collections.Generic;
using System.Data;
using System.IO;
using System.Diagnostics;
using System.Runtime.Serialization;
using System.Globalization;

/// <summary>
/// Minimal CSV reader that loads a whole file into a <see cref="DataTable"/>.
/// Values are separated by commas and records by CR, LF or CR+LF. A value may be
/// enclosed in double quotes, in which case it can contain commas, line breaks and
/// quotes (an embedded quote is written as two consecutive quotes).
/// The file is read byte by byte and every byte becomes the character with the same
/// code; multi-byte encodings are not decoded.
/// </summary>
public sealed class CsvParser
{
    //constants
    private const int quote = '"';
    private const int comma = ',';
    private const int carrierreturn = '\r';
    private const int linefeed = '\n';
    //Value returned by Stream.ReadByte when there is nothing left to read
    private const int endofstream = -1;

    //File Name
    private string fileName;

    /// <summary>
    /// File name property
    /// </summary>
    public string FileName
    {
        get { return fileName; }
        set { fileName = value; }
    }

    //shared reader, only valid while Parse is running
    private FileStream reader;

    //Flag to signal end of line (set by GetValue, cleared by its callers)
    private bool endofline;

#if DEBUG
    public TimeSpan timeSpan;
#endif

    /// <summary>
    /// Default parameter less constructor
    /// </summary>
    public CsvParser()
    {
    }

    /// <summary>
    /// Constructor to set file
    /// </summary>
    /// <param name="fileName">File name to parse</param>
    /// <exception cref="ArgumentException">The file name is null or empty</exception>
    /// <exception cref="FileNotFoundException">The file does not exist</exception>
    public CsvParser(string fileName)
    {
        //Input validation
        if (String.IsNullOrEmpty(fileName))
            throw new ArgumentException("File name can not be null");
        if (File.Exists(fileName) == false)
            throw new FileNotFoundException(fileName);

        this.fileName = fileName;
    }

    /// <summary>
    /// Parses Csv and return a data table
    /// </summary>
    /// <param name="headerIncluded">
    /// True when the first record holds the column names. If header is not included
    /// columns will be named Column0, Column1, ... and the first record is read as data.
    /// </param>
    /// <returns>Data table with rows populated from Csv file</returns>
    /// <exception cref="ArgumentException">No file name has been set</exception>
    /// <exception cref="DuplicateNameException">The header contains the same column name twice</exception>
    /// <exception cref="IndexOutOfRangeException">A record has more values than there are columns</exception>
    public DataTable Parse(bool headerIncluded)
    {
        //Input validation
        if (String.IsNullOrEmpty(fileName))
            throw new ArgumentException("File name can not be null");

        DataTable dataTable = new DataTable();
        dataTable.Locale = CultureInfo.InvariantCulture;

#if DEBUG
        Stopwatch stopwatch = new Stopwatch();
        stopwatch.Start();
#endif
        //Read access only: the parser never writes, and asking for write access
        //would make read-only files fail to open
        reader = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read);
        try
        {
            ReadColumns(dataTable, headerIncluded);
            ReadRows(dataTable);
        }
        finally
        {
            //Do not leave a disposed stream behind in the shared field
            reader.Dispose();
            reader = null;
        }
#if DEBUG
        stopwatch.Stop();
        timeSpan = stopwatch.Elapsed;
#endif
        return dataTable;
    }

    /// <summary>
    /// Reads the first record and creates one column per value found in it.
    /// </summary>
    private void ReadColumns(DataTable dataTable, bool headerIncluded)
    {
        int idx = 0;
        endofline = false;

        while (!endofline && reader.Position != reader.Length)
        {
            string value = GetValue();
            if (headerIncluded == false)
                value = "Column" + idx;
            idx++;

            dataTable.Columns.Add(value);
        }

        //Without a header the first record was only used to count the columns,
        //so go back to the start and let it be read again as data
        if (headerIncluded == false)
            reader.Position = 0;

        endofline = false;
    }

    /// <summary>
    /// Reads all remaining records and appends them to the table as rows.
    /// </summary>
    private void ReadRows(DataTable dataTable)
    {
        DataRow dataRow = dataTable.NewRow();
        int idx = 0;

        while (reader.Position != reader.Length)
        {
            endofline = false;
            string value = GetValue();

            //A line with nothing on it is not a record, skip it
            if (endofline && idx == 0 && value.Length == 0)
                continue;

            dataRow[idx++] = value;

            if (endofline)
            {
                dataTable.Rows.Add(dataRow);
                dataRow = dataTable.NewRow();
                idx = 0;
            }
        }

        //The file ended right after a comma: keep the values read so far
        if (idx > 0)
            dataTable.Rows.Add(dataRow);
    }

    /// <summary>
    /// Reads the next value from the stream and leaves the stream positioned just
    /// after the separator (comma or line break) that ended it. Sets the end of line
    /// flag when the value was the last one of its record, which includes reaching
    /// the end of the file.
    /// </summary>
    /// <returns>The value without its enclosing quotes and with doubled quotes collapsed</returns>
    /// <exception cref="ArgumentException">The stream is missing or can not be read</exception>
    private string GetValue()
    {
        //If stream is null throw exception
        if (reader == null)
            throw new ArgumentException("CSVDataset: stream can not be null");

        if (reader.CanRead == false)
            throw new ArgumentException("CSVDataset: Can not read stream");

        System.Text.StringBuilder text = new System.Text.StringBuilder();
        bool withinQuote = false; // Is current position within a quote

        while (true)
        {
            int currentByte = reader.ReadByte();

            //End of file also ends the current record
            if (currentByte == endofstream)
            {
                endofline = true;
                break;
            }

            if (withinQuote)
            {
                if (currentByte != quote)
                {
                    //Commas and line breaks are plain data inside quotes
                    text.Append((char)currentByte);
                }
                else if (PeekByte() == quote)
                {
                    //Two quotes in a row stand for one literal quote
                    reader.ReadByte();
                    text.Append((char)quote);
                }
                else
                {
                    //A single quote closes the quoted part, whatever follows it
                    withinQuote = false;
                }
                continue;
            }

            if (currentByte == quote)
            {
                withinQuote = true;
                continue;
            }

            if (currentByte == comma)
                break;

            if ((currentByte == carrierreturn) || (currentByte == linefeed))
            {
                //CR+LF is one line break, jump the LF
                if ((currentByte == carrierreturn) && (PeekByte() == linefeed))
                    reader.ReadByte();

                endofline = true;
                break;
            }

            text.Append((char)currentByte);
        }

        //Reading value completed. Return
        return text.ToString();
    }

    /// <summary>
    /// Returns the next byte without consuming it, or -1 at the end of the stream.
    /// </summary>
    private int PeekByte()
    {
        int nextByte = reader.ReadByte();

        //As ReadByte moved cursor ahead bring cursor back
        if (nextByte != endofstream)
            reader.Seek(-1, SeekOrigin.Current);

        return nextByte;
    }
}
}

I hope this is useful as a starting point. More advanced parsers can be found on CodeProject: http://www.codeproject.com/KB/database/CsvReader.aspx

No comments: