{"metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:00018204","sets":["934:1119:1124:1125"]},"path":["1125"],"owner":"1","recid":"18204","title":["Level-3 BLAS and LU Factorization on a Matrix Processor"],"pubdate":{"attribute_name":"公開日","attribute_value":"2008-03-15"},"_buckets":{"deposit":"02ad8f1b-80e5-4647-b9b5-203f8314d302"},"_deposit":{"id":"18204","pid":{"type":"depid","value":"18204","revision_id":0},"owners":[1],"status":"published","created_by":1},"item_title":"Level-3 BLAS and LU Factorization on a Matrix Processor","author_link":["0","0"],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"Level-3 BLAS and LU Factorization on a Matrix Processor"},{"subitem_title":"Level-3 BLAS and LU Factorization on a Matrix Processor","subitem_title_language":"en"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"数値計算","subitem_subject_scheme":"Other"}]},"item_type_id":"3","publish_date":"2008-03-15","item_3_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"Department of Information Systems  The University of Aizu"},{"subitem_text_value":"Department of Information Systems  The University of Aizu"}]},"item_3_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"Department of Information Systems, The University of Aizu","subitem_text_language":"en"},{"subitem_text_value":"Department of Information Systems, The University of Aizu","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"eng"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing 
file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/18204/files/IPSJ-TACS4902005.pdf"},"date":[{"dateType":"Available","dateValue":"2010-03-15"}],"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-TACS4902005.pdf","filesize":[{"value":"1.1 MB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"660","billingrole":"5"},{"tax":["include_tax"],"price":"330","billingrole":"6"},{"tax":["include_tax"],"price":"0","billingrole":"16"},{"tax":["include_tax"],"price":"0","billingrole":"11"},{"tax":["include_tax"],"price":"0","billingrole":"14"},{"tax":["include_tax"],"price":"0","billingrole":"15"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_date","version_id":"932cfd86-7c4f-44cb-ba0e-67d03075a65d","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2008 by the Information Processing Society of Japan"}]},"item_3_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"AhmedS.Zekri"},{"creatorName":"StanislavG.Sedukhin"}],"nameIdentifiers":[{}]}]},"item_3_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Ahmed, S.Zekri","creatorNameLang":"en"},{"creatorName":"Stanislav, G.Sedukhin","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_3_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AA11833852","subitem_source_identifier_type":"NCID"}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_6501","resourcetype":"journal 
article"}]},"item_3_source_id_11":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"1882-7829","subitem_source_identifier_type":"ISSN"}]},"item_3_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"As increasing clock frequency approaches its physical limits  a good approach to enhance performance is to increase parallelism by integrating more cores as coprocessors to general-purpose processors in order to handle the different workloads in scientific  engineering  and signal processing applications. In this paper  we propose a many-core matrix processor model consisting of a scalar unit augmented with b×b simple cores tightly connected in a 2D torus matrix unit to accelerate matrix-based kernels. Data load/store is overlapped with computing using a decoupled data access unit that moves b×b blocks of data between memory and the two scalar and matrix processing units. The operation of the matrix unit is mainly processing fine-grained b×b matrix multiply-add (MMA) operations. We formulate the data alignment operations including matrix transposition and skewing as MMA operations in order to overlap them with data load/store. Two fundamental linear algebra algorithms are designed and analytically evaluated on the proposed matrix processor: the Level-3 BLAS kernel  GEMM  and the LU factorization with partial pivoting  the main step in solving linear systems of equations. For the GEMM kernel  the maximum speed of computing measured in FLOPs/cycle is approached for different matrix sizes  n  and block sizes  b. The speed of the LU factorization for relatively large values of n ranges from around 50–90% of the maximum speed depending on the model parameters. 
Overall  the analytical results show the merits of using the matrix unit for accelerating the matrix-based applications.","subitem_description_type":"Other"}]},"item_3_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"As increasing clock frequency approaches its physical limits, a good approach to enhance performance is to increase parallelism by integrating more cores as coprocessors to general-purpose processors in order to handle the different workloads in scientific, engineering, and signal processing applications. In this paper, we propose a many-core matrix processor model consisting of a scalar unit augmented with b×b simple cores tightly connected in a 2D torus matrix unit to accelerate matrix-based kernels. Data load/store is overlapped with computing using a decoupled data access unit that moves b×b blocks of data between memory and the two scalar and matrix processing units. The operation of the matrix unit is mainly processing fine-grained b×b matrix multiply-add (MMA) operations. We formulate the data alignment operations including matrix transposition and skewing as MMA operations in order to overlap them with data load/store. Two fundamental linear algebra algorithms are designed and analytically evaluated on the proposed matrix processor: the Level-3 BLAS kernel, GEMM, and the LU factorization with partial pivoting, the main step in solving linear systems of equations. For the GEMM kernel, the maximum speed of computing measured in FLOPs/cycle is approached for different matrix sizes, n, and block sizes, b. The speed of the LU factorization for relatively large values of n ranges from around 50–90% of the maximum speed depending on the model parameters. 
Overall, the analytical results show the merits of using the matrix unit for accelerating the matrix-based applications.","subitem_description_type":"Other"}]},"item_3_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"52","bibliographic_titles":[{"bibliographic_title":"情報処理学会論文誌コンピューティングシステム（ACS）"}],"bibliographicPageStart":"37","bibliographicIssueDates":{"bibliographicIssueDate":"2008-03-15","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"SIG2(ACS21)","bibliographicVolumeNumber":"49"}]},"relation_version_is_last":true,"weko_creator_id":"1"},"id":18204,"updated":"2025-01-22T22:54:43.713731+00:00","links":{},"created":"2025-01-18T22:50:59.963584+00:00"}