notex.nvim/lua/notex/parser/markdown.lua

224 lines
5.7 KiB
Lua
Raw Permalink Normal View History

2025-10-05 20:16:33 -04:00
-- Markdown content parsing module
-- M is the module table; the public functions defined below are attached to it
-- and it is returned at the end of the file.
local M = {}
-- NOTE(review): `utils` is not referenced by name anywhere in the visible part
-- of this file. Confirm whether this require is needed for its load-time side
-- effects before removing it.
local utils = require('notex.utils')
--- Extract the span of `content` delimited by two Lua patterns.
-- The returned text is inclusive: it starts at the first character matched by
-- `start_pattern` and ends at the last character matched by `end_pattern`.
-- @param content string  text to search
-- @param start_pattern string  Lua pattern marking where the span begins
-- @param end_pattern string  Lua pattern marking where the span ends
-- @return string|nil  the delimited span; everything from the start match to
--   the end of `content` when no end match exists; nil when `start_pattern`
--   does not match at all
function M.extract_content_between(content, start_pattern, end_pattern)
  local start_pos, start_stop = content:find(start_pattern)
  if not start_pos then
    return nil
  end

  -- Search for the end delimiter only AFTER the start delimiter. Searching
  -- from start_pos lets identical delimiters (e.g. "---" ... "---") match the
  -- very same text, which collapses the result to a single character.
  local _, end_stop = content:find(end_pattern, start_stop + 1)
  if not end_stop then
    return content:sub(start_pos)
  end

  -- Slice through the END of the end match so a multi-character end delimiter
  -- is kept whole instead of being cut after its first character.
  return content:sub(start_pos, end_stop)
end
--- Remove a leading YAML front-matter block from `content`.
-- The header is a line of `---`, the metadata lines, and a closing `---` line,
-- optionally preceded by whitespace. Both LF and CRLF line endings are
-- accepted on the fence lines.
-- @param content string  full document text
-- @return string|nil  `content` without its header (unchanged when no header
--   is present); nil when `content` is not a string
function M.remove_yaml_header(content)
  -- Callers in this module test the body for nil, so report a missing
  -- document as nil instead of raising on `content:gsub`.
  if type(content) ~= "string" then
    return nil
  end

  -- gsub returns (result, substitution_count). Bind only the string so the
  -- count never leaks to callers that forward or spread the return value.
  local body = content:gsub("^%s*%-%-%-\r?\n.-\n%-%-%-\r?\n", "", 1)
  return body
end
--- Return the markdown body, i.e. `content` with any YAML header removed.
-- @param content string  full document text
-- @return exactly one value: whatever M.remove_yaml_header returns first
function M.get_body(content)
  -- The parentheses truncate the call to a single value. A bare tail call
  -- would forward every value remove_yaml_header returns (string.gsub also
  -- yields a substitution count), which corrupts call sites such as
  -- `table.insert(t, M.get_body(c))`.
  return (M.remove_yaml_header(content))
end
--- Count the words in the markdown body of `content`.
-- Fenced code blocks, images and link URLs are excluded; link text and the
-- text inside emphasis / inline code is counted.
-- @param content string  full document text (YAML header allowed)
-- @return number  word count, 0 when there is no body
function M.count_words(content)
  local body = M.get_body(content)
  if not body then
    return 0
  end

  local clean = body
    -- Fenced blocks must go before inline code: otherwise the inline-code
    -- rule consumes the fence backticks and the code is counted as prose.
    :gsub("```.-```", " ")
    -- Images before links, since an image contains a link-shaped tail.
    :gsub("!%[.-%]%(.-%)", " ")
    -- Keep the visible link text; only the URL is markup.
    :gsub("%[(.-)%]%(.-%)", "%1")
    :gsub("%*%*(.-)%*%*", "%1") -- Bold
    :gsub("%*(.-)%*", "%1") -- Italic
    :gsub("`(.-)`", "%1") -- Inline code
    :gsub("#+ ", "") -- Heading markers
    -- Turn punctuation into separators, but keep bytes >= 0x80 so that
    -- UTF-8 encoded words are not erased or split apart.
    :gsub("[^%w\128-\255]+", " ")

  local count = 0
  for _ in clean:gmatch("%S+") do
    count = count + 1
  end
  return count
end
--- Report the size of the markdown body of `content`.
-- The value is the result of Lua's `#` operator on the body string, i.e. its
-- length in bytes.
-- @param content string  full document text
-- @return number  body length, or 0 when M.get_body yields nil/false
function M.count_characters(content)
  local text = M.get_body(content)
  if not text then
    return 0
  end
  return #text
end
--- Extract ATX-style headings (`# Title`) from the markdown body.
-- Lines inside ``` fenced code blocks are ignored so that `# comment` lines in
-- code are not reported as headings.
-- @param content string  full document text (YAML header allowed)
-- @return table  list of { level = number of '#', title = trimmed text,
--   raw = "<hashes> <title>" } in document order
function M.extract_headings(content)
  local headings = {}
  local body = M.get_body(content)
  if not body then
    return headings
  end

  -- Scan line by line. Inside string.gmatch a leading '^' does not act as an
  -- anchor, and '$' only matches the end of the WHOLE string, so a single
  -- anchored gmatch over the body cannot find per-line headings.
  local in_fence = false
  for line in (body .. "\n"):gmatch("(.-)\r?\n") do
    if line:match("^%s*```") then
      in_fence = not in_fence
    elseif not in_fence then
      -- Trailing whitespace is dropped by the pattern itself, so no
      -- non-standard string `trim` method is needed.
      local level, title = line:match("^(#+)%s+(.-)%s*$")
      if level and title ~= "" then
        headings[#headings + 1] = {
          level = #level,
          title = title,
          raw = level .. " " .. title,
        }
      end
    end
  end

  return headings
end
--- Collect every inline `[text](url)` occurrence in `content`.
-- The whole input is scanned (no YAML header stripping). The pattern does not
-- look at the character before `[`, so the `[alt](src)` part of an image also
-- matches. The text may be empty; the url must contain at least one character.
-- @param content string  markdown text
-- @return table  list of { text, url, raw } records in order of appearance,
--   where raw is the link re-assembled as "[text](url)"
function M.extract_links(content)
  local found = {}
  local link_pattern = "%[([^%]]*)%]%(([^)]+)%)"

  for label, target in content:gmatch(link_pattern) do
    local record = {}
    record.text = label
    record.url = target
    record.raw = table.concat({ "[", label, "](", target, ")" })
    found[#found + 1] = record
  end

  return found
end
--- Extract ``` fenced code blocks from `content`.
-- @param content string  markdown text
-- @return table  list of { language, code, lines } in document order.
--   `language` is the first word of the fence's info string ("text" when the
--   info string is empty), `code` is the text between the fence lines, and
--   `lines` is the number of lines in `code`.
function M.extract_code_blocks(content)
  local code_blocks = {}

  -- Accept any info string up to the end of the opening fence line. Limiting
  -- it to %w* rejects languages such as "c++" or "shell-session"; the scan
  -- then resumes at the CLOSING fence and pairs it with the next opening
  -- fence, returning the prose between two blocks as if it were code.
  for info, code in content:gmatch("```([^\n`]*)\n(.-)\n```") do
    local _, newline_count = code:gsub("\n", "")
    code_blocks[#code_blocks + 1] = {
      -- First whitespace-delimited token; this also drops a trailing "\r"
      -- left by CRLF line endings and any attributes after the language.
      language = info:match("^%s*(%S+)") or "text",
      code = code,
      lines = newline_count + 1,
    }
  end

  return code_blocks
end
--- Build a short plain-text summary from the first prose paragraph.
-- Fenced code blocks and heading lines are skipped, as are paragraphs that
-- start with "[" (for example a [TOC] marker). Emphasis and inline-code
-- markers are removed, images are dropped, links are reduced to their text.
-- @param content string  full document text (YAML header allowed)
-- @param max_length number|nil  maximum length in bytes (default 200); longer
--   summaries are cut and end in "..."
-- @return string  the summary, or "" when no suitable paragraph exists
function M.get_summary(content, max_length)
  -- Comparing a number with nil raises, so give the limit a default.
  max_length = max_length or 200

  local body = M.get_body(content)
  if not body then
    return ""
  end

  -- Remove code blocks to keep them out of the summary; normalise CRLF so
  -- blank-line detection below works on either line-ending style.
  local clean_body = body:gsub("```.-```", ""):gsub("\r\n", "\n")

  -- Walk the blank-line separated paragraphs in document order and take the
  -- first one that still has text once heading lines are removed. Matching
  -- "\n\n(...)\n\n" first would skip an opening paragraph, and a pattern
  -- ending in a lazy ".-" would capture only a single character.
  local first_paragraph
  for paragraph in (clean_body .. "\n\n"):gmatch("(.-)\n%s*\n") do
    local candidate = ("\n" .. paragraph)
      :gsub("\n#+[ \t][^\n]*", "")
      :match("^%s*(.-)%s*$")
    if candidate ~= "" and candidate:sub(1, 1) ~= "[" then
      first_paragraph = candidate
      break
    end
  end
  if not first_paragraph then
    return ""
  end

  -- Clean up markdown formatting, collapse whitespace and trim.
  local summary = first_paragraph
    :gsub("!%[.-%]%(.-%)", "") -- Images (before links)
    :gsub("%[(.-)%]%(.-%)", "%1") -- Links keep their text
    :gsub("%*%*(.-)%*%*", "%1") -- Bold
    :gsub("%*(.-)%*", "%1") -- Italic
    :gsub("`(.-)`", "%1") -- Inline code
    :gsub("%s+", " ")
    :match("^%s*(.-)%s*$")

  if #summary > max_length then
    -- Byte-based cut; math.max keeps the index sane for limits below 3.
    summary = summary:sub(1, math.max(max_length - 3, 0)) .. "..."
  end
  return summary
end
--- Gather the module's individual metrics for `content` into one table.
-- @param content string  full document text (YAML header allowed)
-- @return table  with fields word_count, character_count, headings, links,
--   code_blocks, summary (limited to 200), line_count, has_toc and
--   reading_time_minutes
function M.analyze_structure(content)
  local words_per_minute = 200 -- assumed reading speed
  local summary_length = 200

  local body = M.get_body(content)
  -- Count once and reuse the value for both word_count and the reading time.
  local word_count = M.count_words(content)
  local _, newline_count = body:gsub("\n", "")

  return {
    word_count = word_count,
    character_count = M.count_characters(content),
    headings = M.extract_headings(content),
    links = M.extract_links(content),
    code_blocks = M.extract_code_blocks(content),
    summary = M.get_summary(content, summary_length),
    line_count = newline_count + 1,
    -- Detect a [TOC] marker at the start of ANY line; it usually sits below
    -- the title rather than at the very beginning of the body.
    has_toc = ("\n" .. body):find("\n[ \t]*%[TOC%]") ~= nil,
    reading_time_minutes = math.ceil(word_count / words_per_minute),
  }
end
--- Run basic sanity checks on markdown text.
-- @param content string|nil  markdown text
-- @return table  list of error message strings; empty when no problem was
--   found. Possible messages: "Empty content", "Unbalanced code blocks",
--   "Malformed link: <link>".
function M.validate_markdown(content)
  local errors = {}

  if not content or content == "" then
    errors[#errors + 1] = "Empty content"
    return errors
  end

  -- Opening and closing fences look identical, so comparing the number of
  -- "opens" with the number of "closes" is always equal. Fences are balanced
  -- exactly when their total count is even. The check applies to every
  -- document, whether or not it contains '#' characters.
  local _, fence_count = content:gsub("```+", "")
  if fence_count % 2 ~= 0 then
    errors[#errors + 1] = "Unbalanced code blocks"
  end

  -- A link is malformed when its target is empty or only whitespace.
  for link, target in content:gmatch("(%[.-%]%((.-)%))") do
    if target:match("^%s*$") then
      errors[#errors + 1] = "Malformed link: " .. link
    end
  end

  return errors
end
--- Convert the markdown body of `content` to plain text.
-- Code fences are unwrapped to their contents; heading, list and checkbox
-- markers are removed; images become "[Image: alt]"; links become their text;
-- emphasis and inline-code markers are stripped; runs of blank lines collapse
-- to a single blank line; surrounding whitespace is trimmed.
-- @param content string  full document text (YAML header allowed)
-- @return string  plain text, "" when there is no body
function M.to_plain_text(content)
  local body = M.get_body(content)
  if not body then
    return ""
  end

  -- A leading "\n" lets every line-start rule use one "\n..." pattern, so
  -- markers on the very first line are handled like all others. Rule order
  -- matters: each comment notes what the rule must precede.
  local plain = ("\n" .. body)
    -- Fenced blocks before inline code, which would eat the fence backticks.
    :gsub("```[^\n]*\n(.-)\n```", "%1")
    :gsub("\n#+[ \t]+", "\n") -- Headings of every level
    -- List markers before emphasis, so "* item" bullets are not read as
    -- the start of an italic span.
    :gsub("\n[ \t]*[-*+][ \t]+", "\n") -- Bullet items
    :gsub("\n[ \t]*%d+%.[ \t]+", "\n") -- Numbered items
    :gsub("\n[ \t]*%[[ \t]*[xX]?[ \t]*%][ \t]+", "\n") -- Checkboxes
    -- Images before links, since an image contains a link-shaped tail.
    :gsub("!%[([^%]]*)%]%(([^)]+)%)", "[Image: %1]")
    :gsub("%[([^%]]*)%]%(([^)]+)%)", "%1") -- Links to text
    :gsub("%*%*(.-)%*%*", "%1") -- Bold
    :gsub("%*(.-)%*", "%1") -- Italic
    :gsub("`(.-)`", "%1") -- Inline code
    :gsub("\n\n+", "\n\n") -- Multiple blank lines
    -- Trim with a pattern; strings have no standard `trim` method.
    :match("^%s*(.-)%s*$")

  return plain
end
return M