// Install the NuGet package: Install-Package PdfPig
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
/// <summary>
/// Renders the words of a PDF as a single HTML table using PdfPig: every group of
/// words that shares a (rounded) vertical position becomes a row, and every word in
/// that group becomes a cell.
/// </summary>
public class PdfPigTableExtractor
{
    /// <summary>
    /// Number of decimal places the bottom edge of a word's bounding box is rounded to
    /// before words are bucketed into rows. Words whose rounded values differ end up in
    /// different rows.
    /// </summary>
    private const int LineYPrecision = 1;

    /// <summary>
    /// Opens the PDF at <paramref name="pdfPath"/> and returns an HTML
    /// <c>&lt;table&gt;</c> containing the words of all pages, in page order.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file to read.</param>
    /// <returns>The HTML markup of the generated table.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfPath"/> is <c>null</c>.</exception>
    public static string ExtractTables(string pdfPath)
    {
        if (pdfPath == null)
        {
            throw new ArgumentNullException(nameof(pdfPath));
        }

        var html = new StringBuilder();
        html.AppendLine("<table border='1' style='border-collapse:collapse'>");

        using (var document = PdfDocument.Open(pdfPath))
        {
            foreach (var page in document.GetPages())
            {
                foreach (var line in GroupWordsIntoLines(page.GetWords()))
                {
                    html.AppendLine("<tr>");
                    foreach (var word in line)
                    {
                        // PDF text is arbitrary input: encode it so characters such as
                        // '<', '>' and '&' cannot break or inject markup.
                        html.Append("<td>")
                            .Append(WebUtility.HtmlEncode(word.Text))
                            .Append("</td>");
                    }
                    html.AppendLine("</tr>");
                }
            }
        }

        html.AppendLine("</table>");
        return html.ToString();
    }

    /// <summary>
    /// Buckets words into rows by the rounded bottom edge of their bounding box.
    /// </summary>
    /// <param name="words">Words of a single page.</param>
    /// <returns>
    /// Rows ordered by descending Y (the original author's intent: top to bottom),
    /// each row ordered by ascending left edge.
    /// </returns>
    private static List<List<Word>> GroupWordsIntoLines(IEnumerable<Word> words)
    {
        // Group by Y coordinate (same row), then sort rows and the words inside each row.
        return words
            .GroupBy(w => Math.Round(w.BoundingBox.Bottom, LineYPrecision))
            .OrderByDescending(g => g.Key)
            .Select(g => g.OrderBy(w => w.BoundingBox.Left).ToList())
            .ToList();
    }
}
// Usage example.
// Wrapped in an explicit entry point because C# does not allow top-level statements
// to appear after a type declaration in the same file (compiler error CS8803).
public static class Program
{
    /// <summary>
    /// Converts "input.pdf" to an HTML table and writes it to "table.html".
    /// </summary>
    public static void Main()
    {
        string html = PdfPigTableExtractor.ExtractTables("input.pdf");
        File.WriteAllText("table.html", html);
    }
}