From 151a53691ca6d628a2b6605c9368a18cd752217b Mon Sep 17 00:00:00 2001 From: Emi Simpson Date: Sun, 17 Mar 2024 12:40:49 -0400 Subject: [PATCH] Add the regex-free version of the parser --- README.md | 6 ++++-- csv_parse_no_regex.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 csv_parse_no_regex.py diff --git a/README.md b/README.md index 0fd906a..23e31b7 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,10 @@ than employ a multi-line function definition, which looks a little sloppy. Stil pretty happy with the result, especially given that conciseness was an explicit goal. If you want to take a look, find the file in [`csv_parse.py`](./csv_parse.py) or try -cloning and running it. The sample data in this repository is stolen from [this handy -repo of sample CSV files](https://github.com/datablist/sample-csv-files). +cloning and running it. I also put together another version which doesn't use python's +regex module, which you can find in [`csv_parse_no_regex.py`](./csv_parse_no_regex.py). +The sample data in this repository is stolen from [this handy repo of sample CSV +files](https://github.com/datablist/sample-csv-files). It's worth noting that this was built purely as an exercise. 
"""Regex-free CSV parser built from small parser combinators.

This is the content of ``csv_parse_no_regex.py``.  A *parser* is any function
that takes the remaining input string and returns a list of ``(value, rest)``
pairs: an empty list means the parser failed, otherwise each pair holds the
parsed value and the input that is still unconsumed.

The accompanying README notes that, while the parser is spec compliant and can
theoretically be used, Python's built-in CSV support is the recommended choice
for real work.
"""
from functools import reduce

# --- list helpers -----------------------------------------------------------
cons = lambda h, t: [h, *t]        # new list with h in front of t
postpend = lambda l, e: [*l, e]    # new list with e after l

# --- primitive parsers ------------------------------------------------------
# Match the literal token t at the start of the input.
parse_tok = lambda t: lambda s: [(t, s[len(t):])] if s.startswith(t) else []
# Match any single character that is NOT contained in e.
parse_except = lambda e: lambda s: [(s[0], s[1:])] if len(s) > 0 and s[0] not in e else []
# Succeed only when the input is exhausted.
parse_eof = lambda s: [('', "")] if len(s) == 0 else []

# --- combinators ------------------------------------------------------------
# Left-biased choice: use a's results, fall back to b only when a fails.
parse_altl = lambda a, b: lambda s: (lambda ra: ra if len(ra) > 0 else b(s))(a(s))
# Run a, then b on what a left over, and combine both values with f.
parse_seq = lambda f: lambda a, b: lambda s: [(f(v1, v2), r2) for (v1, r1) in a(s) for (v2, r2) in b(r1)]
parse_seql = parse_seq(lambda l, r: l)   # sequence, keep the left value
parse_seqr = parse_seq(lambda l, r: r)   # sequence, keep the right value
# Succeed with v without consuming any input.
parse_pure = lambda v: lambda s: [(v, s)]
# Apply f to every value produced by a.
parse_map = lambda f, a: lambda s: [(f(v), r) for (v, r) in a(s)]


def parse_many(a):
    """Return a parser applying *a* zero or more times, collecting a list.

    The repetition is greedy: it stops at the first position where *a* fails,
    and it always succeeds (possibly with an empty list).  It is implemented
    with an explicit stack rather than recursion so that long fields and files
    with many rows do not exhaust Python's recursion limit.  Results are
    produced in the same depth-first order a recursive definition would give.

    Raises:
        ValueError: if *a* succeeds without consuming input, since repeating
            such a parser would never terminate.
    """
    def many(s):
        finished = []
        pending = [([], s)]  # (values collected so far, remaining input)
        while pending:
            values, rest = pending.pop()
            steps = a(rest)
            if not steps:
                # a failed here, so this branch of the repetition is complete.
                finished.append((values, rest))
                continue
            # Push in reverse so the first alternative is explored first.
            for value, remaining in reversed(steps):
                if len(remaining) >= len(rest):
                    raise ValueError(
                        'parse_many was given a parser that succeeds '
                        'without consuming input'
                    )
                # The popped list is owned by this branch alone; copy it only
                # when several alternatives must each extend it.
                extended = values if len(steps) == 1 else [*values]
                extended.append(value)
                pending.append((extended, remaining))
        return finished
    return many


# Zero or more a's separated by sep; every a-then-sep pair is consumed
# greedily and a final a must follow.
parse_many_sep = lambda sep: lambda a: parse_altl(parse_seq(postpend)(parse_many(parse_seql(a, sep)), a), parse_pure([]))
# First matching token wins, so longer tokens must be listed before their prefixes.
parse_any_tok = lambda *ts: reduce(parse_altl, [parse_tok(t) for t in ts], lambda s: [])

# --- CSV grammar ------------------------------------------------------------
parse_whitespace = parse_many(parse_tok(' '))
parse_dbqt = parse_tok('"')
parse_str_char = parse_except('"')
parse_newline = parse_any_tok('\r\n', '\n', '\r')
# Inside a quoted field, two double quotes stand for one literal quote.
parse_escaped_quote = parse_map(lambda _: '"', parse_tok('""'))
# Optional spaces, opening quote, body characters, closing quote, optional spaces.
parse_quoted = parse_seql(
    parse_seqr(
        parse_seqr(parse_whitespace, parse_dbqt),
        parse_many(parse_altl(parse_str_char, parse_escaped_quote)),
    ),
    parse_seql(parse_dbqt, parse_whitespace),
)
parse_unquoted = parse_many(parse_except('"\r\n,'))
parse_field = parse_map(lambda chars: ''.join(chars), parse_altl(parse_quoted, parse_unquoted))
parse_line = parse_many_sep(parse_tok(','))(parse_field)
parse_lines = parse_many_sep(parse_newline)(parse_line)
# A trailing newline yields one extra empty row; drop it.
trim_final_empty_line = lambda lines: lines[:-1] if len(lines) > 0 and lines[-1] in [[""], []] else lines
parse_csv = parse_seql(parse_map(trim_final_empty_line, parse_lines), parse_eof)


def read_csv_file(file_path):
    """Parse the CSV file at *file_path* into a list of rows.

    Each row is a list of field strings.  Returns ``None`` when the file
    content cannot be parsed as CSV.
    """
    # newline='' disables newline translation so that line breaks inside
    # quoted fields reach the parser unchanged; the grammar itself accepts
    # '\r\n', '\n' and '\r' as row terminators.
    with open(file_path, 'r', newline='') as file:
        parse_results = parse_csv(file.read())
        return parse_results[0][0] if len(parse_results) else None


if __name__ == "__main__":
    parsed_table = read_csv_file("./my_data.csv")
    if parsed_table is None:
        raise SystemExit('./my_data.csv could not be parsed as CSV')
    print(f'Row 3, Column D reads: {parsed_table[2][3]}')