Add the regex-free version of the parser

This commit is contained in:
Emi Simpson 2024-03-17 12:40:49 -04:00
parent 7be3c2311b
commit 151a53691c
Signed by: Emi
GPG key ID: A12F2C2FFDC3D847
2 changed files with 43 additions and 2 deletions

View file

@ -11,8 +11,10 @@ than employ a multi-line function definition, which looks a little sloppy. Stil
pretty happy with the result, especially given that conciseness was an explicit goal.
If you want to take a look, find the file in [`csv_parse.py`](./csv_parse.py) or try
cloning and running it. The sample data in this repository is stolen from [this handy
repo of sample CSV files](https://github.com/datablist/sample-csv-files).
cloning and running it. I also put together another version which doesn't use Python's
regex module, which you can find in [`csv_parse_no_regex.py`](./csv_parse_no_regex.py).
The sample data in this repository is stolen from [this handy repo of sample CSV
files](https://github.com/datablist/sample-csv-files).
It's worth noting that this was built purely as an exercise. While it is spec compliant
and can theoretically actually be used, I would recommend using Python's built-in CSV

39
csv_parse_no_regex.py Normal file
View file

@ -0,0 +1,39 @@
from functools import reduce
def cons(h, t):
    """Return a new list holding ``h`` followed by every item of ``t``."""
    return [h] + list(t)


def postpend(l, e):
    """Return a new list holding every item of ``l`` followed by ``e``."""
    return list(l) + [e]
def parse_tok(t):
    """Build a parser that matches the literal string ``t``.

    The returned parser takes the remaining input ``s`` and returns
    ``[(t, rest)]`` when ``s`` starts with ``t`` (``rest`` being ``s`` with
    that prefix removed), or ``[]`` when it does not.
    """
    def parser(s):
        if not s.startswith(t):
            return []
        return [(t, s[len(t):])]
    return parser


def parse_except(e):
    """Build a parser that consumes one character that is not in ``e``.

    The returned parser returns ``[(char, rest)]`` for the first character
    of its input, or ``[]`` when the input is empty or that character is
    contained in ``e``.
    """
    def parser(s):
        if len(s) == 0 or s[0] in e:
            return []
        return [(s[0], s[1:])]
    return parser


def parse_eof(s):
    """Succeed only on empty input, yielding the marker value ``'<EOF>'``."""
    if len(s) != 0:
        return []
    return [('<EOF>', "")]
def parse_altl(a, b):
    """Left-biased choice between two parsers.

    The returned parser runs ``a``; if ``a`` produced at least one result
    those results are returned, otherwise the results of ``b`` on the same
    input are returned.
    """
    def parser(s):
        first = a(s)
        if len(first) > 0:
            return first
        return b(s)
    return parser


def parse_seq(f):
    """Build a sequencing combinator that merges two results with ``f``.

    ``parse_seq(f)(a, b)`` is a parser that runs ``a``, then runs ``b`` on
    whatever ``a`` left over, and yields ``f(value_of_a, value_of_b)``
    together with the input remaining after ``b``. If either parser yields
    no results, the sequence yields none.
    """
    def combine(a, b):
        def parser(s):
            out = []
            for v1, r1 in a(s):
                for v2, r2 in b(r1):
                    out.append((f(v1, v2), r2))
            return out
        return parser
    return combine


# Run two parsers in sequence, keeping only the left parser's value.
parse_seql = parse_seq(lambda left, _right: left)
# Run two parsers in sequence, keeping only the right parser's value.
parse_seqr = parse_seq(lambda _left, right: right)


def parse_pure(v):
    """Build a parser that always succeeds with ``v``, consuming nothing."""
    def parser(s):
        return [(v, s)]
    return parser


def parse_map(f, a):
    """Build a parser that runs ``a`` and applies ``f`` to each result value."""
    def parser(s):
        out = []
        for v, r in a(s):
            out.append((f(v), r))
        return out
    return parser
def parse_many(a):
    """Build a parser that applies ``a`` zero or more times, greedily.

    The returned parser takes the remaining input ``s`` and returns a list
    of ``(values, rest)`` pairs, where ``values`` is the list of values
    produced by successive applications of ``a`` and ``rest`` is the input
    left at the point where ``a`` stopped matching. It always returns at
    least one pair: ``([], s)`` when ``a`` fails on ``s`` straight away.

    When ``a`` returns several alternatives, every alternative is followed
    and the results are listed depth-first in the order ``a`` produced them.

    The repetition is driven by an explicit stack instead of recursion, so
    the number of repetitions is not limited by the interpreter's recursion
    limit (long fields and inputs with many rows do not raise
    ``RecursionError``).

    Raises:
        ValueError: if ``a`` succeeds without consuming any input, because
            repeating such a parser could never terminate.
    """
    def parser(s):
        results = []
        # Each pending entry is (chain, rest). ``chain`` is a linked list of
        # the values matched so far, newest first: None for "no values yet",
        # otherwise (previous_chain, value). Sharing chains between
        # alternatives avoids copying the value list at every step.
        pending = [(None, s)]
        while pending:
            chain, rest = pending.pop()
            matches = a(rest)
            if not matches:
                # ``a`` no longer matches: this path is complete. Unwind the
                # chain into a list in match order.
                values = []
                while chain is not None:
                    chain, value = chain
                    values.append(value)
                values.reverse()
                results.append((values, rest))
                continue
            # Push in reverse so the first alternative is explored first,
            # keeping results in the same order a recursive descent yields.
            for value, remaining in reversed(matches):
                if len(remaining) >= len(rest):
                    raise ValueError(
                        'parse_many: the repeated parser matched without '
                        'consuming input'
                    )
                pending.append(((chain, value), remaining))
        return results
    return parser
def parse_many_sep(sep):
    """Build a combinator for lists of items separated by ``sep``.

    ``parse_many_sep(sep)(a)`` is a parser that matches any number of
    "``a`` then ``sep``" pairs followed by one final ``a``, and yields the
    list of the ``a`` values (separator values are discarded). If that form
    does not match, it yields an empty list without consuming input.
    """
    def with_item(a):
        leading_items = parse_many(parse_seql(a, sep))
        items = parse_seq(postpend)(leading_items, a)
        return parse_altl(items, parse_pure([]))
    return with_item


def parse_any_tok(*ts):
    """Build a parser matching the first of the literal tokens ``ts`` that fits.

    Tokens are tried in the order given; with no tokens the parser never
    matches.
    """
    def never(s):
        return []

    parser = never
    for t in ts:
        parser = parse_altl(parser, parse_tok(t))
    return parser
# Zero or more space characters.
parse_whitespace = parse_many(parse_tok(' '))
# A double-quote character, which opens and closes a quoted field.
parse_dbqt = parse_tok('"')
# Any single character other than a double quote.
parse_str_char = parse_except('"')
# A line break; '\r\n' is tried first so a CRLF pair counts as one break.
parse_newline = parse_any_tok('\r\n', '\n', '\r')
# Two consecutive double quotes, yielding one literal double quote.
parse_escaped_quote = parse_seqr(parse_tok('""'), parse_pure('"'))
# A quoted field: optional spaces, an opening quote, the field's characters
# (with '""' read as '"'), a closing quote, optional spaces. Only the list of
# characters between the quotes is kept as the value.
parse_quoted = parse_seqr(
    parse_whitespace,
    parse_seqr(
        parse_dbqt,
        parse_seql(
            parse_many(parse_altl(parse_str_char, parse_escaped_quote)),
            parse_seql(parse_dbqt, parse_whitespace),
        ),
    ),
)
# An unquoted field: characters up to the next quote, line break or comma.
parse_unquoted = parse_many(parse_except('"\r\n,'))
# A field: a quoted field if one matches, otherwise an unquoted one; the
# matched characters are joined into a single string.
parse_field = parse_map(''.join, parse_altl(parse_quoted, parse_unquoted))
# One record: fields separated by commas; yields the list of field strings.
parse_line = parse_many_sep(parse_tok(','))(parse_field)
# The whole input: records separated by line breaks; yields a list of records.
parse_lines = parse_many_sep(parse_newline)(parse_line)
def trim_final_empty_line(lines):
    """Return ``lines`` without its last record when that record is empty.

    A record counts as empty when it is ``[""]`` (a single empty field) or
    ``[]``. Any other input is returned unchanged.
    """
    if len(lines) == 0:
        return lines
    last = lines[-1]
    if last == [""] or last == []:
        return lines[:-1]
    return lines
# A complete CSV document: all records followed by end of input, with a
# trailing empty record (left by a final line break) removed from the value.
parse_csv = parse_map(
    trim_final_empty_line,
    parse_seql(parse_lines, parse_eof),
)
def read_csv_file(file_path):
    """Read the file at ``file_path`` and parse its contents with ``parse_csv``.

    Args:
        file_path: Path of the CSV file; passed straight to ``open``.

    Returns:
        The value of the first result returned by ``parse_csv`` (a list of
        records, each a list of field strings), or ``None`` when
        ``parse_csv`` returns no results, i.e. the contents did not parse.
    """
    # newline='' turns off universal-newline translation, so '\r\n' and '\r'
    # inside quoted fields reach the parser unchanged. The grammar itself
    # accepts '\r\n', '\n' and '\r' as record separators.
    with open(file_path, 'r', newline='') as file:
        parse_results = parse_csv(file.read())
    return parse_results[0][0] if parse_results else None
# Demo: parse ./my_data.csv and print the cell at row index 2, column index 3
# (the third row, fourth column). read_csv_file returns None when the contents
# do not parse, in which case the indexing below raises TypeError.
parsed_table = read_csv_file("./my_data.csv")
print(f'Row 3, Column D reads: {parsed_table[2][3]}')