Add the regex-free version of the parser

Emi Simpson 2024-03-17 12:40:49 -04:00
parent 7be3c2311b
commit 151a53691c
Signed by: Emi
GPG Key ID: A12F2C2FFDC3D847
2 changed files with 43 additions and 2 deletions


@@ -11,8 +11,10 @@ than employ a multi-line function definition, which looks a little sloppy. Still
 pretty happy with the result, especially given that conciseness was an explicit goal.
 If you want to take a look, find the file in [`csv_parse.py`](./csv_parse.py) or try
-cloning and running it. The sample data in this repository is stolen from [this handy
-repo of sample CSV files](https://github.com/datablist/sample-csv-files).
+cloning and running it. I also put together another version which doesn't use python's
+regex module, which you can find in [`csv_parse_no_regex.py`](./csv_parse_no_regex.py).
+The sample data in this repository is stolen from [this handy repo of sample CSV
+files](https://github.com/datablist/sample-csv-files).
 
 It's worth noting that this was built purely as an exercise. While it is spec compliant
 and can theoretically actually be used, I would recommend using Python's built-in CSV
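For reference, the built-in module the README recommends handles the same job in a few lines; a minimal sketch (my illustration, not part of the commit), reusing the `./my_data.csv` path from the script below:

# Standard-library equivalent of the hand-rolled parser, as recommended above.
import csv
with open('./my_data.csv', newline='') as f:
    table = list(csv.reader(f))
print(f'Row 3, Column D reads: {table[2][3]}')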

csv_parse_no_regex.py Normal file

@@ -0,0 +1,39 @@
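# A tiny parser-combinator kit: each parser takes a string and returns a list of
# (value, remaining_input) pairs, with the empty list signalling failure.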
from functools import reduce
cons = lambda h, t: [h, *t]
postpend = lambda l, e: [*l, e]
parse_tok = lambda t: lambda s: [(t, s[len(t):])] if s.startswith(t) else []
parse_except = lambda e: lambda s: [(s[0], s[1:])] if len(s) > 0 and s[0] not in e else []
parse_eof = lambda s: [('<EOF>', "")] if len(s) == 0 else []
parse_altl = lambda a, b: lambda s: (lambda ra: ra if len(ra) > 0 else b(s))(a(s))
parse_seq = lambda f: lambda a, b: lambda s: [(f(v1, v2), r2) for (v1, r1) in a(s) for (v2, r2) in b(r1)]
parse_seql = parse_seq(lambda l, r: l)
parse_seqr = parse_seq(lambda l, r: r)
parse_pure = lambda v: lambda s: [(v, s)]
parse_map = lambda f, a: lambda s: [(f(v), r) for (v, r) in a(s)]
parse_many = lambda a: parse_altl(parse_seq(cons)(a, lambda s: parse_many(a)(s)), parse_pure([]))
parse_many_sep = lambda sep: lambda a: parse_altl(parse_seq(postpend)(parse_many(parse_seql(a, sep)), a), parse_pure([]))
parse_any_tok = lambda *ts: reduce(parse_altl, [parse_tok(t) for t in ts], lambda s: [])
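# The CSV grammar itself, assembled from the combinators above.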
parse_whitespace = parse_many(parse_tok(' '))
parse_dbqt = parse_tok('"')
parse_str_char = parse_except('"')
parse_newline = parse_any_tok('\r\n', '\n', '\r')
parse_escaped_quote = parse_map(lambda _: '"', parse_tok('""'))
parse_quoted = parse_seql(parse_seqr(parse_seqr(parse_whitespace, parse_dbqt), parse_many(parse_altl(parse_str_char, parse_escaped_quote))), parse_seql(parse_dbqt, parse_whitespace))
parse_unquoted = parse_many(parse_except('"\r\n,'))
parse_field = parse_map(lambda chars: ''.join(chars), parse_altl(parse_quoted, parse_unquoted))
parse_line = parse_many_sep(parse_tok(','))(parse_field)
parse_lines = parse_many_sep(parse_newline)(parse_line)
trim_final_empty_line = lambda lines: lines[:-1] if len(lines)>0 and lines[-1] in [[""],[]] else lines
parse_csv = parse_seql(parse_map(trim_final_empty_line, parse_lines), parse_eof)
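# Parse a CSV file from disk; returns the table as a list of rows, or None if parsing fails.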
def read_csv_file(file_path):
with open(file_path, 'r') as file:
parse_results = parse_csv(file.read())
return parse_results[0][0] if len(parse_results) else None
parsed_table = read_csv_file("./my_data.csv")
print(f'Row 3, Column D reads: {parsed_table[2][3]}')
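Since every parser above works on a plain string, the pipeline is easy to exercise without a file; a small sanity check (my sketch, assuming the definitions above are in scope), covering quoted fields, an escaped quote, and the trailing newline that trim_final_empty_line strips:

# Not part of the committed file: parse a two-row CSV held in memory.
sample = 'name,comment\r\nEmi,"said ""hi"", then left"\r\n'
results = parse_csv(sample)
# parse_csv returns a list of (value, remaining_input) pairs; on success the
# value is the table as a list of rows and the input is fully consumed.
assert results[0][0] == [['name', 'comment'], ['Emi', 'said "hi", then left']]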