[{"key":"dc.contributor.author","value":"Dehghani, Dorna","language":null},{"key":"dc.date.accessioned","value":"2026-04-15T18:29:36Z","language":null},{"key":"dc.date.available","value":"2026-04-15T18:29:36Z","language":null},{"key":"dc.date.issued","value":"2026","language":"en"},{"key":"dc.identifier.uri","value":"http:\/\/hdl.handle.net\/2429\/94079","language":null},{"key":"dc.description.abstract","value":"Environmental assessment and permitting processes generate large volumes of administrative data, typically stored in project-specific spreadsheets that record iterative comment\u2013response exchanges across multiple review rounds. Although these spreadsheets contain semantically similar information\u2014such as comment text, response text, dates, agencies, and review status\u2014their schemas, layouts, and value encodings vary across projects. This structural heterogeneity prevents systematic cross-project analysis and limits transparency, reproducibility, and scalability. This thesis formulates the integration of heterogeneous permitting spreadsheets as a schema matching and normalization problem and presents a deterministic end-to-end pipeline that transforms project-level Excel files into a unified long-format dataset. The pipeline combines column-level schema alignment with value-level normalization. Instruction-tuned large language models (LLMs) generate enriched column descriptions and support interpretable semantic equivalence decisions within a structured matching framework, while preserving auditability by serving as contextual representations rather than autonomous transformation logic. 
Following schema alignment, rule-based transformations standardize dates, agency names, compound identifiers, null placeholders, and wide-format review rounds, producing a consistent representation in which each record corresponds to a single comment\u2013response pair with associated metadata.\r\nEvaluation is conducted using a project-level train\u2013test split to assess generalization to previously unseen spreadsheet structures. Results indicate that LLM-assisted semantic matching combined with deterministic preprocessing enables robust schema alignment across heterogeneous files while maintaining interpretability and reproducibility. Question answering over the normalized dataset further demonstrates that integration reduces analytical effort for cross-project queries.\r\nThis work provides a practical and reproducible framework for structured data integration in regulatory and administrative contexts under local, single-GPU deployment constraints.","language":"en"},{"key":"dc.language.iso","value":"eng","language":"en"},{"key":"dc.publisher","value":"University of British Columbia","language":"en"},{"key":"dc.rights","value":"Attribution-NonCommercial-NoDerivatives 4.0 International","language":"*"},{"key":"dc.rights.uri","value":"http:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/","language":"*"},{"key":"dc.title","value":"Integrating heterogeneous environmental permitting spreadsheets through LLM-assisted schema matching and deterministic normalization","language":"en"},{"key":"dc.type","value":"Text","language":"en"},{"key":"dc.degree.name","value":"Master of Science - MSc","language":"en"},{"key":"dc.degree.discipline","value":"Computer Science","language":"en"},{"key":"dc.degree.grantor","value":"University of British Columbia","language":"en"},{"key":"dc.contributor.supervisor","value":"Lakshmanan, Laks V. 
S., 1959-","language":null},{"key":"dc.date.graduation","value":"2026-05","language":"en"},{"key":"dc.type.text","value":"Thesis\/Dissertation","language":"en"},{"key":"dc.description.affiliation","value":"Science, Faculty of","language":"en"},{"key":"dc.description.affiliation","value":"Computer Science, Department of","language":"en"},{"key":"dc.degree.campus","value":"UBCV","language":"en"},{"key":"dc.description.scholarlevel","value":"Graduate","language":"en"}]