[{"key":"dc.contributor.author","value":"Chua, Martin","language":null},{"key":"dc.date.accessioned","value":"2026-04-09T22:54:44Z","language":null},{"key":"dc.date.available","value":"2026-04-09T22:54:44Z","language":null},{"key":"dc.date.issued","value":"2026","language":"en"},{"key":"dc.identifier.uri","value":"http:\/\/hdl.handle.net\/2429\/93972","language":null},{"key":"dc.description.abstract","value":"Hardware acceleration for machine learning applications has become increasingly important as models grow and evolve rapidly. FPGAs are able to adapt to these changes quickly because they are hardware reconfigurable while also providing low latency, high throughput, and efficiency. The efficiency of machine learning acceleration is intrinsically tied to memory access latency, capacity, and bandwidth. On an FPGA, fine-grained resources like flip-flops and LUTRAMs provide lower\r\nlatency access but offer limited storage capacity. Dedicated on-chip BRAMs provide higher density but are still finite. Off-chip DRAM suffers from increased latency and constrained bandwidth, which limits throughput of model training and inference.\r\nPrior work has proposed an architectural enhancement that allows the user to re-purpose unused configuration bits as user-accessible memory. In typical FPGA implementations, a significant portion of routing segments is left unused. By modifying the switch block architecture, the configuration bits controlling unused segments can be implemented as user storage. Inspired by this work and the growing demand for machine learning acceleration, we present three research contributions.\r\nThe first contribution is an FPGA architecture enhancement, called switch block memory, that allows the user to re-purpose unused FPGA switch block configuration bits to implement weight memory in machine learning applications.\r\nThe second contribution is a comprehensive analysis of machine learning memory utilization to identify the specific contexts where our switch block memories are most effective. The third contribution is an augmented CAD flow integrated into the\r\nopen-source VTR CAD suite to evaluate the proposed architecture. When applied to selected machine learning workloads, our approach achieves up to a 9% improvement in Fmax, a 3% reduction in total wire length, and enables up to 80 Mb of additional on-chip storage for large FPGA devices.","language":"en"},{"key":"dc.language.iso","value":"eng","language":"en"},{"key":"dc.publisher","value":"University of British Columbia","language":"en"},{"key":"dc.rights","value":"Attribution-NonCommercial-NoDerivatives 4.0 International","language":"*"},{"key":"dc.rights.uri","value":"http:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/","language":"*"},{"key":"dc.title","value":"An FPGA memory architecture to enable efficient weight implementations for machine learning applications","language":"en"},{"key":"dc.type","value":"Text","language":"en"},{"key":"dc.degree.name","value":"Master of Applied Science - MASc","language":"en"},{"key":"dc.degree.discipline","value":"Electrical and Computer Engineering","language":"en"},{"key":"dc.degree.grantor","value":"University of British Columbia","language":"en"},{"key":"dc.contributor.supervisor","value":"Wilton, Steve","language":null},{"key":"dc.date.graduation","value":"2026-05","language":"en"},{"key":"dc.type.text","value":"Thesis\/Dissertation","language":"en"},{"key":"dc.description.affiliation","value":"Applied Science, Faculty of","language":"en"},{"key":"dc.description.affiliation","value":"Electrical and Computer Engineering, Department of","language":"en"},{"key":"dc.degree.campus","value":"UBCV","language":"en"},{"key":"dc.description.scholarlevel","value":"Graduate","language":"en"}]