-- Markdown content parsing module
-- Module table; every public function below is attached to it.
local M = {}

-- NOTE(review): `utils` is not referenced by name in this file; it may be
-- required only for its side effects -- confirm before removing.
local utils = require('notex.utils')
|
|
|
|
--- Extract the span of `content` delimited by two Lua patterns.
-- The result starts at the first match of `start_pattern` and runs through
-- the end of the first match of `end_pattern` found after it, so both
-- delimiters are included. When no end delimiter follows, everything from
-- the start delimiter to the end of `content` is returned.
-- @tparam string content text to search
-- @tparam string start_pattern Lua pattern marking the start of the span
-- @tparam string end_pattern Lua pattern marking the end of the span
-- @treturn string|nil the extracted span, or nil when `content` is not a
--   string or `start_pattern` does not match
function M.extract_content_between(content, start_pattern, end_pattern)
  if type(content) ~= "string" then
    return nil
  end

  local start_first, start_last = content:find(start_pattern)
  if not start_first then
    return nil
  end

  -- Search strictly after the start delimiter; otherwise identical or
  -- overlapping delimiters (e.g. "---" / "---") would match in place.
  local _, end_last = content:find(end_pattern, start_last + 1)
  if not end_last then
    return content:sub(start_first)
  end

  -- Slice through the END of the end match so the closing delimiter is
  -- not cut off after its first character.
  return content:sub(start_first, end_last)
end
|
|
|
|
--- Remove a leading YAML front-matter block from `content`.
-- The header must open with a `---` line at the top of the text (leading
-- whitespace is tolerated) and close with another `---` line. Both LF and
-- CRLF line endings are accepted. Text without such a header is returned
-- unchanged.
-- @tparam string content raw markdown text
-- @treturn string|nil the text without its header, or nil when `content`
--   is not a string
function M.remove_yaml_header(content)
  if type(content) ~= "string" then
    return nil
  end

  -- The surrounding parentheses truncate gsub's (string, count) result to
  -- the string alone, so callers never receive the substitution count.
  return (content:gsub("^%s*%-%-%-\r?\n.-\r?\n%-%-%-\r?\n", "", 1))
end
|
|
|
|
--- Return the markdown body, i.e. `content` with any YAML header removed.
-- Delegates to `M.remove_yaml_header`.
-- @tparam string content raw markdown text
-- @treturn string the body text (exactly one value is returned)
function M.get_body(content)
  -- Bind to a local first: returning the call directly would forward every
  -- value the helper returns, not just the body string.
  local body = M.remove_yaml_header(content)
  return body
end
|
|
|
|
--- Count the words in the markdown body of `content`.
-- Fenced code blocks and images are excluded; link text and inline code
-- are counted. A word is a run of `%w` characters, so contractions and
-- hyphenated terms count as several words.
-- NOTE(review): `%w` is locale-dependent; under the default C locale
-- non-ASCII letters are not counted -- confirm whether that matters here.
-- @tparam string content raw markdown text (YAML header allowed)
-- @treturn number word count, 0 when there is no body
function M.count_words(content)
  local body = M.get_body(content)
  if not body then
    return 0
  end

  -- Order matters: fences must go before inline code (otherwise the
  -- backtick pattern tears the fences apart and the code gets counted),
  -- and images before links (a link pattern also matches inside "![..](..)").
  local text = body
    :gsub("```.-```", "")                      -- fenced code blocks
    :gsub("!%[[^%]]*%]%([^)]*%)", "")          -- images
    :gsub("%[([^%]]*)%]%([^)]*%)", "%1")       -- links: keep the visible text
    :gsub("#+ ", "")                           -- heading markers
    :gsub("%*%*(.-)%*%*", "%1")                -- bold
    :gsub("%*(.-)%*", "%1")                    -- italic
    :gsub("`(.-)`", "%1")                      -- inline code

  -- gsub's second result is the number of matches, i.e. the word count.
  local _, count = text:gsub("%w+", "")
  return count
end
|
|
|
|
--- Count the characters in the markdown body of `content`.
-- Characters are counted as UTF-8 code points rather than bytes, so
-- multi-byte characters count once. For pure ASCII text this equals the
-- byte length.
-- @tparam string content raw markdown text (YAML header allowed)
-- @treturn number character count, 0 when there is no body
function M.count_characters(content)
  local body = M.get_body(content)
  if not body then
    return 0
  end

  -- Every byte that is not a UTF-8 continuation byte (0x80-0xBF) starts a
  -- new character; gsub's second result is how many such bytes matched.
  local _, count = body:gsub("[^\128-\191]", "")
  return count
end
|
|
|
|
--- Extract ATX headings (`# Title`) from the markdown body of `content`.
-- Lines inside fenced code blocks are ignored, so shell comments and the
-- like are not reported as headings.
-- @tparam string content raw markdown text (YAML header allowed)
-- @treturn table list of `{ level = number, title = string, raw = string }`
--   in document order; `title` has surrounding whitespace removed
function M.extract_headings(content)
  local headings = {}
  local body = M.get_body(content)
  if not body then
    return headings
  end

  -- Walk the body line by line: in string.gmatch a leading "^" does not
  -- anchor, and "$" only matches at the end of the whole string, so a
  -- single anchored gmatch over the body cannot find per-line headings.
  local in_fence = false
  for line in (body .. "\n"):gmatch("(.-)\r?\n") do
    if line:find("^%s*```") then
      in_fence = not in_fence
    elseif not in_fence then
      local level, title = line:match("^(#+)%s+(.-)%s*$")
      if level and title ~= "" then
        headings[#headings + 1] = {
          level = #level,
          title = title,
          raw = level .. " " .. title,
        }
      end
    end
  end

  return headings
end
|
|
|
|
--- Collect every inline `[text](url)` occurrence found in `content`.
-- The whole input is scanned, including any YAML header.
-- @tparam string content raw markdown text
-- @treturn table list of `{ text = string, url = string, raw = string }`
--   in order of appearance
function M.extract_links(content)
  local found = {}
  local cursor = 1

  while true do
    local first, last, label, target =
      content:find("%[([^%]]*)%]%(([^)]+)%)", cursor)
    if not first then
      break
    end

    found[#found + 1] = {
      text = label,
      url = target,
      raw = string.format("[%s](%s)", label, target),
    }
    -- Resume right after this match, as an iterator over matches would.
    cursor = last + 1
  end

  return found
end
|
|
|
|
--- Extract fenced code blocks from `content`.
-- The opening fence may carry any info string (`c++`, `lua title=x`, ...);
-- its first word is reported as the language. The closing fence must start
-- a line. An unterminated fence ends the scan.
-- @tparam string content raw markdown text
-- @treturn table list of `{ language = string, code = string, lines = number }`;
--   `language` is "text" when the fence has no info string, `lines` is the
--   number of newlines in `code` plus one
function M.extract_code_blocks(content)
  local code_blocks = {}
  local pos = 1

  while true do
    -- Pair fences explicitly: a single pattern that rejects one opening
    -- fence would slide forward and pair a closing fence with the next
    -- opening one.
    local _, open_last, info = content:find("```([^\n`]*)\n", pos)
    if not open_last then
      break
    end

    local code, close_last
    if content:sub(open_last + 1, open_last + 3) == "```" then
      -- Empty block: the closing fence follows the opening line directly.
      code, close_last = "", open_last + 3
    else
      local close_first, last = content:find("\n```", open_last + 1, true)
      if not close_first then
        break
      end
      code, close_last = content:sub(open_last + 1, close_first - 1), last
    end

    local _, newlines = code:gsub("\n", "")
    code_blocks[#code_blocks + 1] = {
      language = info:match("%S+") or "text",
      code = code,
      lines = newlines + 1,
    }
    pos = close_last + 1
  end

  return code_blocks
end
|
|
|
|
--- Build a short plain-text summary from the first paragraph of the body.
-- Fenced code blocks are ignored. Heading lines are skipped, as are
-- paragraphs that open with "[". If the document contains no ordinary
-- paragraph, the first heading's text is used instead. Inline markdown
-- (emphasis, inline code, links, images) is reduced to plain text.
-- @tparam string content raw markdown text (YAML header allowed)
-- @tparam[opt=200] number max_length maximum summary length in bytes;
--   longer summaries are cut and suffixed with "..."
-- @treturn string the summary, "" when nothing suitable is found
function M.get_summary(content, max_length)
  max_length = max_length or 200

  local body = M.get_body(content)
  if not body then
    return ""
  end

  -- Drop fenced code blocks so code never ends up in the summary.
  local text = body:gsub("```.-```", "")

  -- Collect the lines of the first ordinary paragraph.
  local paragraph, heading = {}, nil
  local skipping = false
  for line in (text .. "\n"):gmatch("(.-)\r?\n") do
    local is_blank = line:find("^%s*$") ~= nil
    local is_heading = line:find("^#+%s") ~= nil
    if is_blank or is_heading then
      -- Blank lines and headings both end a paragraph.
      if #paragraph > 0 then
        break
      end
      skipping = false
      if is_heading and not heading then
        heading = line
      end
    elseif not skipping then
      if #paragraph == 0 and line:sub(1, 1) == "[" then
        -- Paragraph opening with "[": skip it up to the next boundary.
        skipping = true
      else
        paragraph[#paragraph + 1] = line
      end
    end
  end

  local first_paragraph = heading
  if #paragraph > 0 then
    first_paragraph = table.concat(paragraph, " ")
  end
  if not first_paragraph then
    return ""
  end

  local summary = first_paragraph
    :gsub("^#+%s+", "")                        -- heading marker (fallback case)
    :gsub("!%[[^%]]*%]%([^)]*%)", "")          -- images, before links
    :gsub("%[([^%]]*)%]%([^)]*%)", "%1")       -- links: keep the visible text
    :gsub("%*%*(.-)%*%*", "%1")                -- bold
    :gsub("%*(.-)%*", "%1")                    -- italic
    :gsub("`(.-)`", "%1")                      -- inline code
    :gsub("%s+", " ")                          -- collapse whitespace
  summary = summary:match("^%s*(.-)%s*$")

  if #summary > max_length then
    local cut = math.max(max_length - 3, 0)
    -- Back up while the byte after the cut is a UTF-8 continuation byte,
    -- so a multi-byte character is never split.
    while cut > 0 do
      local next_byte = summary:byte(cut + 1)
      if next_byte and next_byte >= 128 and next_byte < 192 then
        cut = cut - 1
      else
        break
      end
    end
    summary = summary:sub(1, cut) .. "..."
  end

  return summary
end
|
|
|
|
--- Gather structural statistics about a markdown document.
-- @tparam string content raw markdown text (YAML header allowed)
-- @treturn table with fields `word_count`, `character_count`, `headings`,
--   `links`, `code_blocks`, `summary` (at most 200 bytes), `line_count`
--   (body lines), `has_toc` (a `[TOC]` marker starts some body line) and
--   `reading_time_minutes` (word count at 200 words per minute, rounded up)
function M.analyze_structure(content)
  local WORDS_PER_MINUTE = 200
  local SUMMARY_LENGTH = 200

  local body = M.get_body(content)
  -- Count words once and reuse the result for the reading-time estimate.
  local word_count = M.count_words(content)
  local _, newline_count = body:gsub("\n", "")

  return {
    word_count = word_count,
    character_count = M.count_characters(content),
    headings = M.extract_headings(content),
    links = M.extract_links(content),
    code_blocks = M.extract_code_blocks(content),
    summary = M.get_summary(content, SUMMARY_LENGTH),
    line_count = newline_count + 1,
    -- Prefixing "\n" lets the marker match at the start of any line,
    -- including the first, rather than only at the top of the body.
    has_toc = ("\n" .. body):find("\n%s*%[TOC%]") ~= nil,
    reading_time_minutes = math.ceil(word_count / WORDS_PER_MINUTE),
  }
end
|
|
|
|
--- Run basic sanity checks on markdown text.
-- Checks performed:
--   * the content is not nil or empty;
--   * code fences come in pairs (an odd number of "```" markers means a
--     block was left open);
--   * no inline link has an empty or whitespace-only target.
-- @tparam string|nil content raw markdown text
-- @treturn table list of error message strings; empty when no problem
--   was found
function M.validate_markdown(content)
  local errors = {}

  if not content or content == "" then
    errors[#errors + 1] = "Empty content"
    return errors
  end

  -- Opening and closing fences are the same token, so "balanced" means an
  -- even count; comparing the count of "```" with itself can never fail.
  local _, fence_count = content:gsub("```", "")
  if fence_count % 2 ~= 0 then
    errors[#errors + 1] = "Unbalanced code blocks"
  end

  -- The label may not contain brackets, which keeps a match from running
  -- from one unrelated "[" across the text to a later "](...)".
  for link, target in content:gmatch("(%[[^%[%]]*%]%(([^)]*)%))") do
    if not target:find("%S") then
      errors[#errors + 1] = "Malformed link: " .. link
    end
  end

  return errors
end
|
|
|
|
--- Convert the markdown body of `content` to plain text.
-- Heading markers of any level are removed, fenced code keeps its code but
-- loses the fences, links become their text, images become
-- `[Image: alt]`, list items become "• " lines and checked task items
-- become "✓ " lines. Emphasis and inline-code markers are stripped.
-- @tparam string content raw markdown text (YAML header allowed)
-- @treturn string plain text with surrounding whitespace removed, ""
--   when there is no body
function M.to_plain_text(content)
  local body = M.get_body(content)
  if not body then
    return ""
  end

  -- The leading newline lets every "\n..." line-start pattern below also
  -- apply to the first line; it is trimmed away at the end.
  --
  -- Order matters: fences before inline code, images before links,
  -- task items before plain bullets, and all list handling before
  -- emphasis (the italic pattern would otherwise consume "*" bullets).
  local plain = ("\n" .. body)
    :gsub("```[^\n]*\n(.-)```", "%1")                            -- fenced code
    :gsub("!%[([^%]]*)%]%(([^)]+)%)", "[Image: %1]")             -- images
    :gsub("%[([^%]]*)%]%(([^)]+)%)", "%1")                       -- links to text
    :gsub("\n#+[ \t]+", "\n")                                    -- headings, any level
    :gsub("\n[ \t]*[-*+][ \t]+%[%s*[xX]%s*%][ \t]+", "\n✓ ")     -- "- [x] done"
    :gsub("\n[ \t]*[-*+][ \t]+%[%s*%][ \t]+", "\n• ")            -- "- [ ] todo"
    :gsub("\n[ \t]*%[%s*[xX]%s*%][ \t]+", "\n✓ ")                -- "[x] done"
    :gsub("\n[ \t]*%[%s*%][ \t]+", "\n• ")                       -- "[ ] todo"
    :gsub("\n[ \t]*[-*+][ \t]+", "\n• ")                         -- bullet items
    :gsub("\n[ \t]*%d+%.[ \t]+", "\n• ")                         -- numbered items
    :gsub("%*%*(.-)%*%*", "%1")                                  -- bold
    :gsub("%*(.-)%*", "%1")                                      -- italic
    :gsub("`(.-)`", "%1")                                        -- inline code
    :gsub("\n\n+", "\n\n")                                       -- collapse blank lines

  -- Trim with a pattern; standard Lua strings have no `trim` method.
  return (plain:match("^%s*(.-)%s*$"))
end
|
|
|
|
-- Export the module table.
return M