{"id":5796,"date":"2011-10-06T09:28:58","date_gmt":"2011-10-06T06:28:58","guid":{"rendered":"http:\/\/hgpu.org\/?p=5796"},"modified":"2011-10-06T09:28:58","modified_gmt":"2011-10-06T06:28:58","slug":"static-gpu-threads-and-an-improved-scan-algorithm","status":"publish","type":"post","link":"https:\/\/hgpu.org\/?p=5796","title":{"rendered":"Static GPU threads and an improved scan algorithm"},"content":{"rendered":"<p>Current GPU programming systems automatically distribute the work on all GPU processors based on a set of fixed assumptions, e.g. that all tasks are independent from each other. We show that automatic distribution limits algorithmic design, and demonstrate that manual work distribution hardly adds any overhead. Our Scan+algorithm is an improved scan relying on manual work distribution. It uses global barriers and task interleaving to provides almost twice the performance of Apple&#8217;s reference implementation [1].<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Current GPU programming systems automatically distribute the work on all GPU processors based on a set of fixed assumptions, e.g. that all tasks are independent from each other. We show that automatic distribution limits algorithmic design, and demonstrate that manual work distribution hardly adds any overhead. Our Scan+algorithm is an improved scan relying on manual [&hellip;]<\/p>\n","protected":false},"author":351,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_jetpack_newsletter_access":"","_jetpack_dont_email_post_to_subs":false,"_jetpack_newsletter_tier_id":0,"_jetpack_memberships_contains_paywalled_content":false,"_jetpack_feature_clip_id":0,"_jetpack_memberships_contains_paid_content":false,"footnotes":"","jetpack_publicize_message":"","jetpack_publicize_feature_enabled":true,"jetpack_social_post_already_shared":false,"jetpack_social_options":{"image_generator_settings":{"template":"highway","default_image_id":0,"font":"","enabled":false},"version":2},"jetpack_post_was_ever_published":false},"categories":[36,11,89,90,3],"tags":[1787,1782,14,20,1181,1793,67,119,70],"class_list":["post-5796","post","type-post","status-publish","format-standard","hentry","category-algorithms","category-computer-science","category-nvidia-cuda","category-opencl","category-paper","tag-algorithms","tag-computer-science","tag-cuda","tag-nvidia","tag-nvidia-geforce-gt-280","tag-opencl","tag-performance","tag-presentation","tag-programming-techniques"],"views":2473,"jetpack_publicize_connections":[],"jetpack_featured_media_url":"","jetpack_sharing_enabled":true,"_links":{"self":[{"href":"https:\/\/hgpu.org\/index.php?rest_route=\/wp\/v2\/posts\/5796","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/hgpu.org\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/hgpu.org\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/hgpu.org\/index.php?rest_route=\/wp\/v2\/users\/351"}],"replies":[{"embeddable":true,"href":"https:\/\/hgpu.org\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=5796"}],"version-history":[{"count":0,"href":"https:\/\/hgpu.org\/index.php?rest_route=\/wp\/v2\/posts\/5796\/revisions"}],"wp:attachment":[{"href":"https:\/\/hgpu.org\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=5796"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/hgpu.org\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=5796"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/hgpu.org\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=5796"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}