Add the regex-free version of the parser

2024-03-17 12:40:49 -04:00 · 2024-03-17 12:40:49 -04:00 · 151a53691c
parent 7be3c2311b
commit 151a53691c
2 changed files with 43 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -11,8 +11,10 @@ than employ a multi-line function definition, which looks a little sloppy.  Stil
 pretty happy with the result, especially given that conciseness was an explicit goal.

 If you want to take a look, find the file in [`csv_parse.py`](./csv_parse.py) or try
-cloning and running it.  The sample data in this repository is stolen from [this handy
-repo of sample CSV files](https://github.com/datablist/sample-csv-files).
+cloning and running it.  I also put together another version that doesn't use Python's
+regex module, which you can find in [`csv_parse_no_regex.py`](./csv_parse_no_regex.py).
+The sample data in this repository is stolen from [this handy repo of sample CSV
+files](https://github.com/datablist/sample-csv-files).

 It's worth noting that this was built purely as an exercise.  While it is spec compliant
 and can theoretically actually be used, I would recommend using Python's built-in CSV
--- a/csv_parse_no_regex.py
+++ b/csv_parse_no_regex.py
@@ -0,0 +1,39 @@
+from functools import reduce
+
+cons = lambda h, t: [h, *t]  # prepend: new list holding h followed by the items of t
+postpend = lambda l, e: [*l, e]  # append: new list holding the items of l followed by e
+
+parse_tok = lambda t: lambda s: [(t, s[len(t):])] if s.startswith(t) else []  # match the literal t at the start of s -> [(t, rest)]; [] means failure
+parse_except = lambda e: lambda s: [(s[0], s[1:])] if len(s) > 0 and s[0] not in e else []  # consume one character that is not in e; fails on empty input
+parse_eof = lambda s: [('<EOF>', "")]if len(s) == 0 else []  # succeeds only when no input is left
+
+parse_altl = lambda a, b: lambda s: (lambda ra: ra if len(ra) > 0 else b(s))(a(s))  # left-biased choice: b is tried only when a yields no results
+parse_seq = lambda f: lambda a, b: lambda s: [(f(v1, v2), r2) for (v1, r1) in a(s) for (v2, r2) in b(r1)]  # run a, then b on the remainder; combine the two values with f
+parse_seql = parse_seq(lambda l, r: l)  # sequence two parsers, keeping only the left value
+parse_seqr = parse_seq(lambda l, r: r)  # sequence two parsers, keeping only the right value
+parse_pure = lambda v: lambda s: [(v, s)]  # always succeeds with v, consuming nothing
+parse_map = lambda f, a: lambda s: [(f(v), r) for (v, r) in a(s)]  # apply f to every value produced by a
+parse_many = lambda a: parse_altl(parse_seq(cons)(a, lambda s: parse_many(a)(s)), parse_pure([]))  # zero or more a's collected into a list; the inner lambda defers the recursive call until parse time (one level of recursion per matched item)
+parse_many_sep = lambda sep: lambda a: parse_altl(parse_seq(postpend)(parse_many(parse_seql(a, sep)), a), parse_pure([]))  # a's separated by sep: any number of "a sep" pairs then a final a; falls back to an empty list
+parse_any_tok = lambda *ts: reduce(parse_altl, [parse_tok(t) for t in ts], lambda s: [])  # first literal in ts that matches, tried in the order given
+
+parse_whitespace = parse_many(parse_tok(' '))  # zero or more space characters (only ' ', not tabs)
+parse_dbqt = parse_tok('"')  # a single double-quote character
+parse_str_char = parse_except('"')  # any one character other than a double quote
+parse_newline = parse_any_tok('\r\n', '\n', '\r')  # '\r\n' is listed first so it is consumed whole rather than as a lone '\r'
+parse_escaped_quote = parse_map(lambda _: '"', parse_tok('""'))  # a doubled quote inside a quoted field stands for one literal quote
+parse_quoted = parse_seql(parse_seqr(parse_seqr(parse_whitespace, parse_dbqt), parse_many(parse_altl(parse_str_char, parse_escaped_quote))), parse_seql(parse_dbqt, parse_whitespace))  # spaces, opening quote, field characters, closing quote, spaces; yields the list of characters between the quotes
+parse_unquoted = parse_many(parse_except('"\r\n,'))  # run of characters up to the next quote, newline or comma (may be empty)
+parse_field = parse_map(lambda chars: ''.join(chars), parse_altl(parse_quoted, parse_unquoted))  # quoted field if that parses, otherwise unquoted; characters joined into one string
+parse_line = parse_many_sep(parse_tok(','))(parse_field)  # one row: comma-separated fields
+parse_lines = parse_many_sep(parse_newline)(parse_line)  # whole table: newline-separated rows
+trim_final_empty_line = lambda lines: lines[:-1] if len(lines)>0 and lines[-1] in [[""],[]] else lines  # drop a final row that is [""] or [], such as the one left by a trailing newline
+parse_csv = parse_seql(parse_map(trim_final_empty_line, parse_lines), parse_eof)  # full parser: rows followed by end of input; leftover input makes it fail with []
+
+def read_csv_file(file_path):  # Parse the CSV file at file_path; returns a list of rows (each a list of str), or None if the text does not parse.
+    with open(file_path, 'r', newline='') as file:  # newline='' turns off newline translation so '\r\n' and '\r' inside quoted fields reach the parser unchanged
+        parse_results = parse_csv(file.read())
+        return parse_results[0][0] if parse_results else None  # each result is a (rows, remaining input) pair; take the rows of the first one
+
+parsed_table = read_csv_file("./my_data.csv")  # demo: parse the sample file; the result is None when parsing fails
+print(f'Row 3, Column D reads: {parsed_table[2][3]}')  # zero-based [2][3] is the third row, fourth column; raises if parsed_table is None or the table is smaller