
Originally Posted by
popcornlove
based on placing two copies of your pdf in the same directory with different names, this seems to process all the files and extract the tables. That assumes your real data is going to resemble the sample data
let
group= (filename)=>
//function to open each file and pull out all the tables in them for processing
let source = pdf.tables(file.contents(filename), [implementation="1.3"]),
list = list.union(list.transform(source[data], each table.columnnames(_))),
#"expanded data" = table.expandtablecolumn(source, "data", list,list),
#"filtered rows" = table.selectrows(#"expanded data", each text.startswith([name], "table")),
#"grouped rows" = table.group(#"filtered rows", {"name"}, {{"data", each _, type table}}),
#"added custom" = table.addcolumn(#"grouped rows", "custom", each split([data]))
in #"added custom",
//function to process each table individually
split= (variable)=> let
#"filtered rows1" = table.selectrows(variable, each ([column1] <> "" and [column1]<>null and [column1]<>"final" and [column2] <> "" and [column2]<>null)),
#"promoted headers" = table.promoteheaders(#"filtered rows1", [promoteallscalars=true]),
#"removed columns" = table.removecolumns(#"promoted headers",list.firstn(table.columnnames(#"promoted headers"),3)),
names=list.removenulls(list.transform(table.columnnames(#"removed columns"), each if text.contains(_,"volume") or text.contains(_,"unit") then _ else null)),
#"removed other columns" = table.selectcolumns(#"removed columns",names),
firstname=list.first(table.columnnames(#"removed other columns")),
#"unpivoted other columns" = table.unpivotothercolumns(#"removed other columns",{firstname} , "attribute", "value"),
#"renamed columns" = table.renamecolumns(#"unpivoted other columns",{{firstname, "category"}}),
transform = table.transformcolumns(#"renamed columns",{{"attribute",each text.beforedelimiter(_,"_"), type text}}),
//dates
repeatcount=table.rowcount(#"removed columns"),
dates = record.tolist(variable{1}),
b = list.removenulls(list.transform(dates, each try if text.contains (_,"(") and not text.contains (_,"table") then text.beforedelimiter(_, "(") else null otherwise null)),
a={list.repeat(b,repeatcount + 2)},
//distance
#"promoted headers5" = table.promoteheaders(variable, [promoteallscalars=true]),
distance = list.repeat(list.removenulls(list.transform(table.columnnames(#"promoted headers5"), each try if text.contains(_,"distance") then number.from(text.select(_,{"0".."9"})) else null otherwise null)),list.count(names)-1),distancetable= table.fromlist(distance, splitter.splitbynothing(), {"value"}, null, extravalues.error),
#"added custom3" = table.addcolumn(distancetable, "category", each "distance"),
//total
#"filtered rows5" = table.selectrows(variable, each ([column1] = "final")){0},
r=record.tolist(#"filtered rows5"),
s=list.positionof(r,"final"),
rr=list.transform(r, each if _="" then null else _),
t=list.removenulls(list.removefirstn(rr,s+1)),
totaltable= table.fromlist(t, splitter.splitbynothing(), {"value"}, null, extravalues.error),
#"added custom4" = table.addcolumn(totaltable, "category", each "final"),
//combine it all
h = table.fromcolumns(table.tocolumns(transform&#"added custom3"& #"added custom4")&a),
#"removed columns1" = table.removecolumns(h,{"column2"}),
#"changed type" = table.transformcolumntypes(#"removed columns1",{{"column1", type text}, {"column3", type number}, {"column4", type date}}),
#"pivoted column" = table.pivot(#"changed type", list.distinct(#"changed type"[column1]), "column1", "column3", list.sum)
in #"pivoted column",
source2 = folder.files("c:\temp5"),
// main code, retrieve all filenames, process them with the functions, then combine and expand them
#"filtered rows" = table.selectrows(source2, each ([extension] = ".pdf")),
#"added custom" = table.addcolumn(#"filtered rows", "data", each group([folder path]&[name])),
#"renamed columns" = table.renamecolumns(#"added custom",{{"name", "zname"}}),
#"expanded data" = table.expandtablecolumn(#"renamed columns", "data",{"name","custom"},{"name","custom"}),
list = list.union(list.transform(#"expanded data"[custom], each table.columnnames(_))),
#"expanded data2" = table.expandtablecolumn(#"expanded data", "custom", list,list),
#"changed type" = table.transformcolumntypes(#"expanded data2",{{"column4", type date}})
in #"changed type"
Bookmarks