Inline text parsing

This commit is contained in:
Nathan McCarty 2025-02-21 02:49:47 -05:00
parent 2033a48c3e
commit 98ed90a9bc
6 changed files with 373 additions and 15 deletions

View file

@ -16,11 +16,12 @@ authors = "Nathan McCarty"
depends = structures
, eff
, refined
, elab-util
-- modules to install
modules = SSG.Parser.Core
, SSG.Parser.Util
, SSG.Parser.Markdown
, SSG.Parser.Djot
, SSG.HTML
, SSG.HTML.ElementTypes

View file

@ -4,6 +4,9 @@ import Data.Vect
import Data.String
import Control.Eff
import Derive.Prelude
%language ElabReflection
---------------------------
-- Internal parser state --
@ -15,12 +18,14 @@ data ParserLocation : Type where
AtPoint : (length : Int) -> (location : Int) -> ParserLocation
EndOfInput : (length : Int) -> ParserLocation
%runElab derive "ParserLocation" [Show]
namespace ParserLocation
export
fromString : String -> ParserLocation
fromString str =
if strLength str > 0
then AtPoint 0 (strLength str)
then AtPoint (strLength str) 0
else EndOfInput 0
||| The state of a parser
@ -30,6 +35,8 @@ record ParserState where
input : String
location : ParserLocation
%runElab derive "ParserState" [Show]
namespace ParserState
export
fromString : String -> ParserState
@ -69,6 +76,15 @@ data Parser : Type -> Type where
-- Detect the end of input
ParseEoF : Parser Bool
Show (Parser a) where
show Save = "save"
show (Load x) = "load: \{show x}"
show (Peek n) = "peek: \{show n}]"
show (ParseChar predicate) = "ParseChar"
show (ParseExact str) = "ParseExact: \{str}"
show (SkipChars predicate) = "SkipChars"
show ParseEoF = "ParseEof"
-----------------------
-- Effectful Actions --
-----------------------
@ -136,7 +152,7 @@ unParser s@(MkPS input (AtPoint length location)) (ParseChar predicate) =
let c = assert_total $ strIndex input location
in if predicate c
then if location + 1 < length
then (Got c, MkPS input (AtPoint length location))
then (Got c, MkPS input (AtPoint length (location + 1)))
else (Got c, MkPS input (EndOfInput length))
else (Failed c, s)
unParser s@(MkPS input (EndOfInput length)) (ParseChar predicate) =

267
src/SSG/Parser/Djot.idr Normal file
View file

@ -0,0 +1,267 @@
module SSG.Parser.Djot
import SSG.Parser.Core
import SSG.Parser.Util
import Data.List1
import Data.Vect
import Data.Nat
import Data.String
import Control.Eff
import Derive.Prelude
-- For iutils unit tests
import System
%language ElabReflection
--*****************************************
--* Data Types *
--*****************************************
public export
data Inline : Type where
HardLineBreak : Inline
SoftLineBreak : Inline
NonBreakingSpace : Inline
Text : (c : Char) -> Inline
%runElab derive "Inline" [Show, Eq]
public export
data Block : Type where
--*****************************************
--* Character Classes and String Escaping *
--*****************************************
-----------------------
-- Character classes --
-----------------------
-- Class contents
punctuationChars : List Char
punctuationChars =
[
'!', '"', '#', '$', '%', '&', '\''
, '(' , ')' , '*' , '+' , ',' , '-'
, '.' , '/' , ':' , ';' , '<' , '='
, '>' , '?' , '@' , '[' , ']' , '^'
, '_' , '`' , '{' , '|' , '}' , '~'
]
horizontalWhitespaceChars : List Char
horizontalWhitespaceChars =
[' ', '\t']
verticalWhitespaceChars : List Char
verticalWhitespaceChars =
['\n', '\r']
-- Class parsers
punctuation : PS Char
punctuation = theseChars punctuationChars
--------------
-- Escaping --
--------------
escapedChar : PS Char
escapedChar = do
_ <- thisString "\\"
oneOfE "Expected an escapable code"
[ punctuation
, exactReplace "n" '\n'
, exactReplace "t" '\t'
, exactReplace "r" '\r'
]
------------------------------------
-- Line Qualifying And Whitespace --
------------------------------------
spaces : PS Nat
spaces = do
xs <- many $ theseChars horizontalWhitespaceChars
pure $ length xs
nonTerminal : PS Char
nonTerminal = notTheseChars verticalWhitespaceChars
lineEnding : PS String
lineEnding = theseStrings ["\r\n", "\n", "\r"]
terminal : PS ()
terminal = do
Nothing <- tryMaybe lineEnding
| _ => pure ()
test <- parseEoF
case test of
False => throw "Expected line terminal"
True => pure ()
line : PS (List Char)
line = do
cs <- many nonTerminal
terminal
pure cs
isHorizontalWhitespace : Char -> Bool
isHorizontalWhitespace c = any (== c) horizontalWhitespaceChars
blankLine : PS (List Char)
blankLine = do
cs <- line
case all isHorizontalWhitespace cs of
False => throw "nonblank line"
True => pure cs
--*****************************************
--* Inline syntax *
--*****************************************
inlineElement : PS Inline
------------------------
-- Escaped Whitespace --
------------------------
hardLineBreak : PS Inline
hardLineBreak = do
_ <- thisString "\\"
_ <- spaces
_ <- lineEnding
pure $ HardLineBreak
nbsp : PS Inline
nbsp = do
_ <- thisString "\\ "
pure $ NonBreakingSpace
softLineBreak : PS Inline
softLineBreak = do
_ <- lineEnding
-- Check to see if the next line is empty, if it is, we are at the end of the inline
-- content, go ahead and bail
state <- save
Nothing <- tryMaybe blankLine
| Just _ => throw "End of inline"
load state
pure $ SoftLineBreak
---------------------
-- Emphasis/Strong --
---------------------
----------
-- Text --
----------
-- Process escape codes before processing as text
escapedText : PS Inline
escapedText = do
c <- escapedChar
pure $ Text c
-- Any non-line-ending character can be part of regular text
text : PS Inline
text = do
c <- nonTerminal
pure $ Text c
---------------------------
-- Overall Inline Parser --
---------------------------
inlineElement = oneOfE "" [
hardLineBreak
, nbsp
, escapedText
, softLineBreak
-- Text is last so that anything can superseed it
, text
]
inline : PS (List1 Inline)
inline = atLeastOne "Expected Inline Content" inlineElement
--*****************************************
--* Utility Functions *
--*****************************************
------------------
-- Constructors --
------------------
namespace Inline
export
fromString : String -> List (Inline)
fromString str with (asList str)
fromString "" | [] = []
fromString (strCons c str) | (c :: x) =
Text c :: fromString str | x
--*****************************************
--* Unit Tests *
--*****************************************
-------------------------------
-- Testing Utility Functions --
-------------------------------
-- Test a parser against a golden value with the supplied input
golden : Show a => Eq a =>
(input : String) -> (reference : a) -> (parser : PS a) -> IO Bool
golden input ref parser = do
putStrLn "Input: \{input}"
putStrLn "Expected: \{show ref}"
let got = runPS parser input
case got of
Left err => do
putStrLn "Failed with error: \{err}"
pure False
Right x => do
putStrLn "Output: \{show x}"
pure $ x == ref
-------------------------
-- Inline Syntax Tests --
-------------------------
-- @@test Plain Text Hello World
inlineTextSmoke : IO Bool
inlineTextSmoke =
let input = "Hello World!" in
golden input (map Text . unpack $ input) (map forget inline)
-- @@test Escaped Text
inlineEscapedSmoke : IO Bool
inlineEscapedSmoke =
let input = "Hello\\n\\*World"
ref = fromString "Hello" ++ [Text '\n', Text '*'] ++ fromString "World"
in golden input ref (map forget inline)
-- @@test Hard Line Break
inlineHardBreakSmoke : IO Bool
inlineHardBreakSmoke =
let input = "Hello\\\nWorld"
ref = fromString "Hello" ++ [HardLineBreak] ++ fromString "World"
in golden input ref (map forget inline)
-- @@test Soft Line Break
inlineSoftBreakSmoke : IO Bool
inlineSoftBreakSmoke =
let input = "Hello\nWorld"
ref = fromString "Hello" ++ [SoftLineBreak] ++ fromString "World"
in golden input ref (map forget inline)
-- @@test Nonbreaking Space
inlineNbspSmoke : IO Bool
inlineNbspSmoke =
let input = "Hello\\ World"
ref = fromString "Hello" ++ [NonBreakingSpace] ++ fromString "World"
in golden input ref (map forget inline)

View file

@ -1,10 +0,0 @@
module SSG.Parser.Markdown
import SSG.Parser.Core
import SSG.Parser.Util
import Data.List1
import Data.Vect
import Data.Nat
import Control.Eff

View file

@ -16,6 +16,12 @@ public export
PS : Type -> Type
PS a = Eff [Parser, Except ParseError] a
||| Run a parser and extact the parsed value
export
runPS : PS a -> (str : String) -> Either String a
runPS x str =
fst . extract . runParser str . runExcept $ x
||| Attempt to run a fallible parser, backtracking on failure
export
try : (f : PS a) -> (err : ParseError -> PS a) -> PS a
@ -67,6 +73,16 @@ atLeastOne err f = do
tail <- many f
pure $ head ::: tail
||| Attempt to parse an exact string
export
thisString : String -> PS String
thisString str = do
res <- parseExact str
case res of
Got x => pure x
Failed x => throw "Expected \{show str} Got \{show x}"
EoF => throw "Unexpected EoF"
||| Attempt to parse one of a list of strings
export
theseStrings : List String -> PS String
@ -87,7 +103,9 @@ theseChars cs = do
Just [next] <- peek 1
| _ => throw "Unexpected EoF"
if any (== next) cs
then pure next
then do
_ <- parseChar (const True)
pure next
else throw "Expected one of \{show cs}, got \{show next}"
||| Attempt to parse any char not in the list
@ -98,4 +116,23 @@ notTheseChars cs = do
| _ => throw "Unexpected EoF"
if any (== next) cs
then throw "Expected not one of \{show cs}, got \{show next}"
else pure next
else do
_ <- parseChar (const True)
pure next
||| Parse an exact match, but 'replace' it with the provided value
export
exactReplace : (str : String) -> (replacement : a) -> PS a
exactReplace str replacement = do
_ <- parseExact str
pure replacement
||| Any character
export
any : PS Char
any = do
x <- parseChar (const True)
case x of
Got y => pure y
(Failed y) => throw "What?"
EoF => throw "Unexpected EoF"

View file

@ -7,3 +7,50 @@ Decided to rename =Tag= to =Html=, and =Raw= to =Text=, which makes this make se
* Parser Core
** TODO Refine =location= in =ParserLocation=
** TODO Error messages
** TODO Combinators for predictive parsing
* Djot [2/40]
:PROPERTIES:
:COOKIE_DATA: recursive
:END:
** Inline Syntax [2/18]
*** DONE Ordinary Text
*** TODO Link
*** TODO Image
*** TODO Autolink
*** TODO Verbatim
*** TODO Emphasis/strong
*** TODO Highlighted
*** TODO Super/subscript
*** TODO Insert/delete
*** TODO Smart punctuation
*** TODO Math
*** TODO Footnote reference
*** DONE Linebreak
*** TODO Comment
*** TODO Symbols
*** TODO Raw inline
*** TODO Span
*** TODO Inline attributes
** Block Syntax [0/15]
*** TODO Paragraph
*** TODO Heading
*** TODO Block quote
*** TODO List item
*** TODO List
*** TODO Code block
*** TODO Thematic break
*** TODO Raw block
*** TODO Div
*** TODO Pipe table
*** TODO Reference link
*** TODO definition
*** TODO Footnote
*** TODO Block attributes
*** TODO Links to headings
** TODO Predictive parsing
** TODO Support all types of whitespace
*** TODO Escaping
*** TODO Whitespace class
** TODO hex escapes
** TODO GFM style alerts
** TODO Group adjacent =Text=s into a =String=