Skip to content

Commit 9565d52

Browse files
authored
feat: fallback static parsing for completely invalid notebooks (#8723)
## 📝 Summary Allows notebooks with broken syntax to be parsed by extracting the "cell boundaries" (i.e. `@app.cell`) and leveraging indentation. This is possibly a bit heavy-handed, but it uses a simple state machine to work through the tokens and find the boundaries. NB: this is _ONLY_ a fallback mechanism, but it should prevent breakage in VS Code and `--watch` when the source file breaks.
1 parent 2ed144a commit 9565d52

9 files changed

Lines changed: 1439 additions & 48 deletions

File tree

marimo/_ast/load.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def _maybe_contents(filename: Optional[Union[str, Path]]) -> Optional[str]:
5858
if filename is None:
5959
return None
6060

61-
return Path(filename).read_text(encoding="utf-8").strip()
61+
return Path(filename).read_text(encoding="utf-8", errors="replace").strip()
6262

6363

6464
def find_cell(filename: str, lineno: int) -> CellDef | None:

marimo/_ast/parse.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -511,14 +511,27 @@ def from_file(filename: Union[str, Path]) -> Parser:
511511
def __init__(self, contents: str, filepath: str = "<marimo>"):
512512
self.extractor = Extractor(contents=contents)
513513
self.filepath = filepath
514+
self._scanner_generated_lines: frozenset[int] = frozenset()
514515

515516
def node_stack(self) -> PeekStack[Node]:
516-
tree = ast_parse(
517-
self.extractor.contents or "",
518-
filename=self.filepath,
519-
suppress_warnings=False,
520-
)
521-
return PeekStack(iter(tree.body))
517+
try:
518+
tree = ast_parse(
519+
self.extractor.contents or "",
520+
filename=self.filepath,
521+
suppress_warnings=False,
522+
)
523+
return PeekStack(iter(tree.body))
524+
except SyntaxError:
525+
# File has syntax errors — use scanner to recover individual cells.
526+
# Never re-raise: parse_notebook must return a best-effort result
527+
# so --watch and IPC are never broken by a syntax error.
528+
from marimo._ast.scanner import scan_parse_fallback
529+
530+
nodes, scanner_lines = scan_parse_fallback(
531+
self.extractor.contents or "", self.filepath
532+
)
533+
self._scanner_generated_lines = scanner_lines
534+
return PeekStack(iter(nodes))
522535

523536
def parse_header(self, body: PeekStack[Node]) -> ParseResult[Header]:
524537
# header? = (docstring | comments)*
@@ -662,7 +675,20 @@ def parse_body(self, body: PeekStack[Node]) -> ParseResult[list[CellDef]]:
662675
if is_body_cell(node):
663676
cell_result = self.extractor.to_cell(node)
664677
violations.extend(cell_result.violations)
665-
cells.append(cell_result.unwrap())
678+
cell = cell_result.unwrap()
679+
# Scanner-generated unparsable cells indicate a syntax
680+
# error in the original cell source.
681+
if (
682+
isinstance(cell, UnparsableCell)
683+
and node.lineno in self._scanner_generated_lines
684+
):
685+
violations.append(
686+
Violation(
687+
SCANNER_UNPARSABLE_CELL_VIOLATION,
688+
lineno=node.lineno,
689+
)
690+
)
691+
cells.append(cell)
666692
elif is_run_guard(node):
667693
break
668694
else:
@@ -1157,6 +1183,9 @@ def parse_notebook(
11571183
ONLY_HEADER_EXTRACTED_VIOLATION = "Only able to extract header."
11581184
NON_MARIMO_PYTHON_SCRIPT_VIOLATION = "non-marimo Python content beyond header"
11591185
EXPECTED_RUN_GUARD_VIOLATION = "Expected run guard statement"
1186+
SCANNER_UNPARSABLE_CELL_VIOLATION = (
1187+
"Cell contains a syntax error and could not be parsed"
1188+
)
11601189

11611190
# Soft violations are auto-corrected on save with no data loss.
11621191
# Any violation NOT in this set is considered "hard" (potential data loss).

0 commit comments

Comments
 (0)