more svg plots
#18
by
						
thomwolf
	
							HF Staff
						- opened
							
					
- assets/data/benchmarks/cp_8Bmemoryusage.html +2 -0
- assets/data/benchmarks/pp_comm_bandwidth.html +2 -0
- assets/data/benchmarks/tp_memoryusage.html +2 -0
- assets/data/benchmarks/tp_sp_memoryusage.html +2 -0
- dist/assets/data/benchmarks/cp_8Bmemoryusage.html +2 -0
- dist/assets/data/benchmarks/pp_comm_bandwidth.html +2 -0
- dist/assets/data/benchmarks/tp_memoryusage.html +2 -0
- dist/assets/data/benchmarks/tp_sp_memoryusage.html +2 -0
- dist/index.html +42 -7
- src/index.html +42 -7
    	
        assets/data/benchmarks/cp_8Bmemoryusage.html
    ADDED
    
    | @@ -0,0 +1,2 @@ | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <div>                        <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
         | 
| 2 | 
            +
                    <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script>                <div id="01408284-93dd-45c6-ba13-d84a152015d9" class="plotly-graph-div" style="height:410px; width:1000px;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("01408284-93dd-45c6-ba13-d84a152015d9")) {                    Plotly.newPlot(                        "01408284-93dd-45c6-ba13-d84a152015d9",                        [{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":true,"x":["1024","4096","16384","65536","131072"],"y":[14.95703125,14.95703125,14.95703125,14.95703125,14.95703125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":true,"x":["1024","4096","16384","65536","131072"],"y":[14.95703125,14.95703125,14.95703125,14.95703125,14.95703125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":true,"x":["1024","4096","16384","65536","131072"],"y":[59.828125,59.828125,59.828125,59.828125,59.828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":true,"x":["1024","4096","16384","65536","131072"],"y":[4.25,17.0,68.0,272.0,544.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model 
Parameters","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[29.9140625,29.9140625,29.9140625,29.9140625,29.9140625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[2.75,11.0,44.0,176.0,352.0],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[29.9140625,29.9140625,29.9140625,29.9140625,29.9140625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[0.6875,2.75,11.0,44.0,88.0],"type":"bar","xaxis":"x3","yaxis":"y3"}],                        
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"
],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinec
olor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":tr
ue,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray","title":{"text":"Memory Usage (GB)"}},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"No 
Parallelism","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=2 CP=1","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=2 CP=4","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"shapes":[{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x domain","y0":80,"y1":80,"yref":"y"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x2 domain","y0":80,"y1":80,"yref":"y2"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x3 domain","y0":80,"y1":80,"yref":"y3"}],"title":{"text":"Memory Usage for 8B Model"},"legend":{"orientation":"v","x":1.02,"y":0.5},"margin":{"r":150},"barmode":"stack","width":1000,"height":410},                        {"responsive": true, "scrollZoom": false}                    )                };                            </script>        </div>
         | 
    	
        assets/data/benchmarks/pp_comm_bandwidth.html
    ADDED
    
    | @@ -0,0 +1,2 @@ | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <div>                        <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
         | 
| 2 | 
            +
                    <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script>                <div id="dfed5d64-b526-45dc-ac6f-55476059857c" class="plotly-graph-div" style="height:410px; width:1000px;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("dfed5d64-b526-45dc-ac6f-55476059857c")) {                    Plotly.newPlot(                        "dfed5d64-b526-45dc-ac6f-55476059857c",                        [{"fill":"toself","fillcolor":"rgba(78,165,183,0.2)","hoverinfo":"skip","line":{"color":"rgba(255,255,255,0)"},"showlegend":false,"x":[0,1,2,3,4,5,6,6,5,4,3,2,1,0],"y":[461.538,436.13,219.09099999999998,192.73399999999992,188.42249999999999,194.227,177.35999999999999,12.493500000000001,24.965,31.34,41.587,44.509,302.33500000000004,422.288],"type":"scatter"},{"line":{"color":"#4ea5b7","width":3},"marker":{"size":10,"symbol":"circle"},"mode":"lines+markers+text","name":"AllReduce","text":["436.0","361.7","160.1","99.6","84.7","64.9","32.9"],"textposition":"bottom 
center","x":[0,1,2,3,4,5,6],"y":[435.9668115942029,361.74920529801324,160.13950738916256,99.56427561837455,84.74052884615384,64.92543661971831,32.937847222222224],"type":"scatter"},{"fill":"toself","fillcolor":"rgba(232,137,171,0.2)","hoverinfo":"skip","line":{"color":"rgba(255,255,255,0)"},"showlegend":false,"x":[0,1,2,3,4,5,6,6,5,4,3,2,1,0],"y":[264.93,226.26999999999998,229.40999999999997,178.47899999999998,126.6575,77.026,44.4165,6.1535,12.314,24.525000000000002,47.147,40.757000000000005,147.97500000000002,239.55200000000002],"type":"scatter"},{"line":{"color":"#e889ab","width":3},"marker":{"size":10,"symbol":"square"},"mode":"lines+markers","name":"AllGather","x":[0,1,2,3,4,5,6],"y":[249.84884057971013,184.61324503311258,118.96753694581281,68.99752650176679,54.972283653846155,27.969183098591547,11.038298611111111],"type":"scatter"},{"fill":"toself","fillcolor":"rgba(206,192,250,0.2)","hoverinfo":"skip","line":{"color":"rgba(255,255,255,0)"},"showlegend":false,"x":[0,1,2,3,4,5,6,6,5,4,3,2,1,0],"y":[264.64599999999996,226.37,215.492,177.54299999999998,126.4825,77.289,45.1295,6.1005,12.39,24.544999999999998,46.802,41.176,146.1,240.804],"type":"scatter"},{"line":{"color":"#cec0fa","width":3},"marker":{"size":10,"symbol":"triangle-up"},"mode":"lines+markers","name":"ReduceScatter","x":[0,1,2,3,4,5,6],"y":[249.72898550724636,181.5535761589404,115.7576354679803,68.55106007067138,54.524230769230776,27.944281690140844,11.069652777777778],"type":"scatter"}],                        
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"
],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinec
olor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":tr
ue,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"title":{"text":"Number of Nodes"},"tickmode":"array","tickvals":[0,1,2,3,4,5,6],"ticktext":["1","2","4","8","16","32","64"]},"yaxis":{"title":{"text":"Bandwidth (GB\u002fs)"},"range":[0,480.0],"gridcolor":"rgba(0,0,0,0.1)"},"legend":{"x":0.85,"y":1,"bgcolor":"rgba(255,255,255,0.5)"},"margin":{"l":80,"r":80,"t":80,"b":80},"title":{"text":"Communication Bandwidth by Number of Nodes (size=256MB)"},"width":1000,"height":410},                        {"responsive": true, "scrollZoom": false}                    )                };                            </script>        </div>
         | 
    	
        assets/data/benchmarks/tp_memoryusage.html
    ADDED
    
    | @@ -0,0 +1,2 @@ | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <div>                        <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
         | 
| 2 | 
            +
                    <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script>                <div id="91ff1fb3-039d-4bf4-88dc-5ca61d350f27" class="plotly-graph-div" style="height:400px; width:1000px;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("91ff1fb3-039d-4bf4-88dc-5ca61d350f27")) {                    Plotly.newPlot(                        "91ff1fb3-039d-4bf4-88dc-5ca61d350f27",                        [{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":true,"x":["1024","4096","16384"],"y":[131.5,131.5,131.5],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":true,"x":["1024","4096","16384"],"y":[131.5,131.5,131.5],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":true,"x":["1024","4096","16384"],"y":[526.0,526.0,526.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":true,"x":["1024","4096","16384"],"y":[21.25,85.0,340.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[16.4375,16.4375,16.4375],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[16.4375,16.4375,16.4375],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer 
States","showlegend":false,"x":["1024","4096","16384"],"y":[65.75,65.75,65.75],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[8.125,32.5,130.0],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[8.21875,8.21875,8.21875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[8.21875,8.21875,8.21875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[32.875,32.875,32.875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[7.1875,28.75,115.0],"type":"bar","xaxis":"x3","yaxis":"y3"}],                        
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"
],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinec
olor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":tr
ue,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"range":[0,150],"dtick":20,"title":{"text":"Memory Usage (GB)"},"showgrid":true,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"No Parallelism 
(TP-1)","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=8","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=16","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"shapes":[{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x domain","y0":80,"y1":80,"yref":"y"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x2 domain","y0":80,"y1":80,"yref":"y2"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x3 domain","y0":80,"y1":80,"yref":"y3"}],"title":{"text":"Memory Usage for 70B Model"},"legend":{"orientation":"v","x":1.02,"y":0.5},"margin":{"r":150},"barmode":"stack","width":1000,"height":400},                        {"responsive": true, "scrollZoom": false}                    )                };                            </script>        </div>
         | 
    	
        assets/data/benchmarks/tp_sp_memoryusage.html
    ADDED
    
    | @@ -0,0 +1,2 @@ | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <div>                        <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
         | 
| 2 | 
            +
                    <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script>                <div id="f8efa5bf-d41e-41c6-98ab-34baf4260090" class="plotly-graph-div" style="height:410px; width:1000px;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("f8efa5bf-d41e-41c6-98ab-34baf4260090")) {                    Plotly.newPlot(                        "f8efa5bf-d41e-41c6-98ab-34baf4260090",                        [{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":true,"x":["1024","4096","16384"],"y":[131.5,131.5,131.5],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[16.4375,16.4375,16.4375],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[8.21875,8.21875,8.21875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":true,"x":["1024","4096","16384"],"y":[131.5,131.5,131.5],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[16.4375,16.4375,16.4375],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[8.21875,8.21875,8.21875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":true,"x":["1024","4096","16384"],"y":[526.0,526.0,526.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Optimizer 
States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[65.75,65.75,65.75],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[32.875,32.875,32.875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":true,"x":["1024","4096","16384"],"y":[21.25,85.0,340.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[2.65625,10.625,42.5],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[1.328125,5.3125,21.25],"type":"bar","xaxis":"x3","yaxis":"y3"}],                        {"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[
0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth"
:0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},
"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"title":{"text":"Sequence 
Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray","title":{"text":"Memory Usage (GB)"}},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"No Parallelism","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=8 (with SP)","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=16 (with SP)","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"shapes":[{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x domain","y0":80,"y1":80,"yref":"y"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x2 domain","y0":80,"y1":80,"yref":"y2"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x3 domain","y0":80,"y1":80,"yref":"y3"}],"title":{"text":"Memory Usage for 70B Model"},"legend":{"orientation":"v","x":1.02,"y":0.5},"margin":{"r":150},"barmode":"stack","width":1000,"height":410},                        {"responsive": true, "scrollZoom": false}                    )                };                            </script>        </div>
         | 
    	
        dist/assets/data/benchmarks/cp_8Bmemoryusage.html
    ADDED
    
    | @@ -0,0 +1,2 @@ | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <div>                        <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
         | 
| 2 | 
            +
                    <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script>                <div id="01408284-93dd-45c6-ba13-d84a152015d9" class="plotly-graph-div" style="height:410px; width:1000px;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("01408284-93dd-45c6-ba13-d84a152015d9")) {                    Plotly.newPlot(                        "01408284-93dd-45c6-ba13-d84a152015d9",                        [{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":true,"x":["1024","4096","16384","65536","131072"],"y":[14.95703125,14.95703125,14.95703125,14.95703125,14.95703125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":true,"x":["1024","4096","16384","65536","131072"],"y":[14.95703125,14.95703125,14.95703125,14.95703125,14.95703125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":true,"x":["1024","4096","16384","65536","131072"],"y":[59.828125,59.828125,59.828125,59.828125,59.828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":true,"x":["1024","4096","16384","65536","131072"],"y":[4.25,17.0,68.0,272.0,544.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model 
Parameters","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[29.9140625,29.9140625,29.9140625,29.9140625,29.9140625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[2.75,11.0,44.0,176.0,352.0],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[29.9140625,29.9140625,29.9140625,29.9140625,29.9140625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384","65536","131072"],"y":[0.6875,2.75,11.0,44.0,88.0],"type":"bar","xaxis":"x3","yaxis":"y3"}],                        
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"
],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinec
olor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":tr
ue,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray","title":{"text":"Memory Usage (GB)"}},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"No 
Parallelism","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=2 CP=1","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=2 CP=4","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"shapes":[{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x domain","y0":80,"y1":80,"yref":"y"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x2 domain","y0":80,"y1":80,"yref":"y2"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x3 domain","y0":80,"y1":80,"yref":"y3"}],"title":{"text":"Memory Usage for 8B Model"},"legend":{"orientation":"v","x":1.02,"y":0.5},"margin":{"r":150},"barmode":"stack","width":1000,"height":410},                        {"responsive": true, "scrollZoom": false}                    )                };                            </script>        </div>
         | 
    	
        dist/assets/data/benchmarks/pp_comm_bandwidth.html
    ADDED
    
    | @@ -0,0 +1,2 @@ | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <div>                        <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
         | 
| 2 | 
            +
                    <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script>                <div id="dfed5d64-b526-45dc-ac6f-55476059857c" class="plotly-graph-div" style="height:410px; width:1000px;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("dfed5d64-b526-45dc-ac6f-55476059857c")) {                    Plotly.newPlot(                        "dfed5d64-b526-45dc-ac6f-55476059857c",                        [{"fill":"toself","fillcolor":"rgba(78,165,183,0.2)","hoverinfo":"skip","line":{"color":"rgba(255,255,255,0)"},"showlegend":false,"x":[0,1,2,3,4,5,6,6,5,4,3,2,1,0],"y":[461.538,436.13,219.09099999999998,192.73399999999992,188.42249999999999,194.227,177.35999999999999,12.493500000000001,24.965,31.34,41.587,44.509,302.33500000000004,422.288],"type":"scatter"},{"line":{"color":"#4ea5b7","width":3},"marker":{"size":10,"symbol":"circle"},"mode":"lines+markers+text","name":"AllReduce","text":["436.0","361.7","160.1","99.6","84.7","64.9","32.9"],"textposition":"bottom 
center","x":[0,1,2,3,4,5,6],"y":[435.9668115942029,361.74920529801324,160.13950738916256,99.56427561837455,84.74052884615384,64.92543661971831,32.937847222222224],"type":"scatter"},{"fill":"toself","fillcolor":"rgba(232,137,171,0.2)","hoverinfo":"skip","line":{"color":"rgba(255,255,255,0)"},"showlegend":false,"x":[0,1,2,3,4,5,6,6,5,4,3,2,1,0],"y":[264.93,226.26999999999998,229.40999999999997,178.47899999999998,126.6575,77.026,44.4165,6.1535,12.314,24.525000000000002,47.147,40.757000000000005,147.97500000000002,239.55200000000002],"type":"scatter"},{"line":{"color":"#e889ab","width":3},"marker":{"size":10,"symbol":"square"},"mode":"lines+markers","name":"AllGather","x":[0,1,2,3,4,5,6],"y":[249.84884057971013,184.61324503311258,118.96753694581281,68.99752650176679,54.972283653846155,27.969183098591547,11.038298611111111],"type":"scatter"},{"fill":"toself","fillcolor":"rgba(206,192,250,0.2)","hoverinfo":"skip","line":{"color":"rgba(255,255,255,0)"},"showlegend":false,"x":[0,1,2,3,4,5,6,6,5,4,3,2,1,0],"y":[264.64599999999996,226.37,215.492,177.54299999999998,126.4825,77.289,45.1295,6.1005,12.39,24.544999999999998,46.802,41.176,146.1,240.804],"type":"scatter"},{"line":{"color":"#cec0fa","width":3},"marker":{"size":10,"symbol":"triangle-up"},"mode":"lines+markers","name":"ReduceScatter","x":[0,1,2,3,4,5,6],"y":[249.72898550724636,181.5535761589404,115.7576354679803,68.55106007067138,54.524230769230776,27.944281690140844,11.069652777777778],"type":"scatter"}],                        
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"
],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinec
olor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":tr
ue,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"title":{"text":"Number of Nodes"},"tickmode":"array","tickvals":[0,1,2,3,4,5,6],"ticktext":["1","2","4","8","16","32","64"]},"yaxis":{"title":{"text":"Bandwidth (GB\u002fs)"},"range":[0,480.0],"gridcolor":"rgba(0,0,0,0.1)"},"legend":{"x":0.85,"y":1,"bgcolor":"rgba(255,255,255,0.5)"},"margin":{"l":80,"r":80,"t":80,"b":80},"title":{"text":"Communication Bandwidth by Number of Nodes (size=256MB)"},"width":1000,"height":410},                        {"responsive": true, "scrollZoom": false}                    )                };                            </script>        </div>
         | 
    	
        dist/assets/data/benchmarks/tp_memoryusage.html
    ADDED
    
    | @@ -0,0 +1,2 @@ | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <div>                        <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
         | 
| 2 | 
            +
                    <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script>                <div id="91ff1fb3-039d-4bf4-88dc-5ca61d350f27" class="plotly-graph-div" style="height:400px; width:1000px;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("91ff1fb3-039d-4bf4-88dc-5ca61d350f27")) {                    Plotly.newPlot(                        "91ff1fb3-039d-4bf4-88dc-5ca61d350f27",                        [{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":true,"x":["1024","4096","16384"],"y":[131.5,131.5,131.5],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":true,"x":["1024","4096","16384"],"y":[131.5,131.5,131.5],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":true,"x":["1024","4096","16384"],"y":[526.0,526.0,526.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":true,"x":["1024","4096","16384"],"y":[21.25,85.0,340.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[16.4375,16.4375,16.4375],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[16.4375,16.4375,16.4375],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer 
States","showlegend":false,"x":["1024","4096","16384"],"y":[65.75,65.75,65.75],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[8.125,32.5,130.0],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[8.21875,8.21875,8.21875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[8.21875,8.21875,8.21875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[32.875,32.875,32.875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[7.1875,28.75,115.0],"type":"bar","xaxis":"x3","yaxis":"y3"}],                        
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"
],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinec
olor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":tr
ue,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"range":[0,150],"dtick":20,"title":{"text":"Memory Usage (GB)"},"showgrid":true,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"No Parallelism 
(TP-1)","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=8","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=16","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"shapes":[{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x domain","y0":80,"y1":80,"yref":"y"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x2 domain","y0":80,"y1":80,"yref":"y2"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x3 domain","y0":80,"y1":80,"yref":"y3"}],"title":{"text":"Memory Usage for 70B Model"},"legend":{"orientation":"v","x":1.02,"y":0.5},"margin":{"r":150},"barmode":"stack","width":1000,"height":400},                        {"responsive": true, "scrollZoom": false}                    )                };                            </script>        </div>
         | 
    	
        dist/assets/data/benchmarks/tp_sp_memoryusage.html
    ADDED
    
    | @@ -0,0 +1,2 @@ | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <div>                        <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
         | 
| 2 | 
            +
                    <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script>                <div id="f8efa5bf-d41e-41c6-98ab-34baf4260090" class="plotly-graph-div" style="height:410px; width:1000px;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("f8efa5bf-d41e-41c6-98ab-34baf4260090")) {                    Plotly.newPlot(                        "f8efa5bf-d41e-41c6-98ab-34baf4260090",                        [{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":true,"x":["1024","4096","16384"],"y":[131.5,131.5,131.5],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[16.4375,16.4375,16.4375],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Model Parameters","marker":{"color":"#4ea5b7"},"name":"Model Parameters","showlegend":false,"x":["1024","4096","16384"],"y":[8.21875,8.21875,8.21875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":true,"x":["1024","4096","16384"],"y":[131.5,131.5,131.5],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[16.4375,16.4375,16.4375],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Gradients","marker":{"color":"#e889ab"},"name":"Gradients","showlegend":false,"x":["1024","4096","16384"],"y":[8.21875,8.21875,8.21875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":true,"x":["1024","4096","16384"],"y":[526.0,526.0,526.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Optimizer 
States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[65.75,65.75,65.75],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Optimizer States","marker":{"color":"#cec0fa"},"name":"Optimizer States","showlegend":false,"x":["1024","4096","16384"],"y":[32.875,32.875,32.875],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":true,"x":["1024","4096","16384"],"y":[21.25,85.0,340.0],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[2.65625,10.625,42.5],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"Activations","marker":{"color":"#e38a42"},"name":"Activations","showlegend":false,"x":["1024","4096","16384"],"y":[1.328125,5.3125,21.25],"type":"bar","xaxis":"x3","yaxis":"y3"}],                        {"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[
0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth"
:0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},
"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"title":{"text":"Sequence 
Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray","title":{"text":"Memory Usage (GB)"}},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"title":{"text":"Sequence Length"},"showgrid":true,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"matches":"y","showticklabels":false,"range":[0,150],"dtick":20,"showgrid":true,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"No Parallelism","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=8 (with SP)","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"TP=16 (with SP)","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"shapes":[{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x domain","y0":80,"y1":80,"yref":"y"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x2 domain","y0":80,"y1":80,"yref":"y2"},{"line":{"color":"red","dash":"dash"},"type":"line","x0":0,"x1":1,"xref":"x3 domain","y0":80,"y1":80,"yref":"y3"}],"title":{"text":"Memory Usage for 70B Model"},"legend":{"orientation":"v","x":1.02,"y":0.5},"margin":{"r":150},"barmode":"stack","width":1000,"height":410},                        {"responsive": true, "scrollZoom": false}                    )                };                            </script>        </div>
         | 
    	
        dist/index.html
    CHANGED
    
    | @@ -919,7 +919,15 @@ | |
| 919 |  | 
| 920 | 
             
                    <p>However, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
         | 
| 921 |  | 
| 922 | 
            -
                    < | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 923 |  | 
| 924 | 
             
                    <p>As we can see, increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU. While tensor parallelism does help reduce activation memory in attention and feedforward layers by sharding the matrix multiplications across GPUs, we don't get the full memory benefits we could. This is because operations like layer normalization and dropout still require gathering the full activations on each GPU, partially negating the memory savings. We can do better by finding ways to parallelize these remaining operations as well.</p>
         | 
| 925 |  | 
| @@ -1068,7 +1076,15 @@ | |
| 1068 |  | 
| 1069 | 
             
                    <p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
         | 
| 1070 |  | 
| 1071 | 
            -
                    < | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1072 |  | 
| 1073 | 
             
                    <p>Does that mean that SP incurs more communication than TP? Well, yes and no. In the forward of a vanilla TP we had two all-reduce per transformer block, and in SP we have two all-gather and two reduce-scatter per transformer block. So SP does twice the number of communication operations as TP. But since an all-reduce operation can be broken down into an all-gather + reduce-scatter (see in [TODO: Appendix link]) they’re actually equivalent in terms of communication. Same reasoning for backward as we just use the conjugate of each operation (no-op ↔ allreduce and allgather ↔ reducescatter).</p>
         | 
| 1074 |  | 
| @@ -1079,9 +1095,10 @@ | |
| 1079 | 
             
                    <p>Besides the fact that TP requires communications in each layer, it also can’t easily be overlapped with compute, which makes throughput heavily dependent on the communication bandwidth. This is why TP is usually done only within a node (TP≤8).</p>
         | 
| 1080 |  | 
| 1081 |  | 
| 1082 | 
            -
                    <aside>Overlapping communication with computation for TP is an active area of research, with recent work like Domino <d-cite bibtex-key="wang2024domino"></d-cite> exploring novel techniques to maximize this overlap.  | 
| 1083 |  | 
| 1084 | 
             
                    <p>As you might expect, this communication overhead becomes increasingly problematic as we scale up tensor parallelism. To illustrate this, let’s check throughput as we scale TP with SP for a 3B model:</p>
         | 
|  | |
| 1085 |  | 
| 1086 | 
             
                    <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/tp_sp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
         | 
| 1087 | 
             
                    <script>
         | 
| @@ -1122,7 +1139,16 @@ | |
| 1122 |  | 
| 1123 | 
             
                    <p>Even if we use full recomputation of the activations, which comes at a heavy compute overhead (30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length:</p>
         | 
| 1124 |  | 
| 1125 | 
            -
                    < | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1126 |  | 
| 1127 | 
             
                    <p>Can we apply similar ideas to our sequence parallelism approach but inside the modules where we apply Tensor Parallelism already, thereby also reducing the effect of sequence length? Yes, it’s time to talk about Context Parallelism, which you will find quite intuitive after all we’ve already covered.</p>
         | 
| 1128 |  | 
| @@ -1130,8 +1156,8 @@ | |
| 1130 |  | 
| 1131 | 
             
                    <p>The idea of Context Parallelism is quite simple; just like Sequence Parallelism, we’ll split the input along the sequence dimension but we now apply this splitting along the full model, instead of only the sequence parallel regions of the model as we’ve done previously with Tensor + Sequence Parallelism.</p>
         | 
| 1132 |  | 
| 1133 | 
            -
                    <p><img alt="cp_8Bmemoryusage.svg" src="/assets/images/cp_8Bmemoryusage.svg" /></p>
         | 
| 1134 | 
            -
             | 
| 1135 | 
             
                    <p>Splitting the sequence doesn't affect most modules like MLP and LayerNorm, where each token is processed independently. It also doesn’t require expensive communication like TP, as only the inputs are split and not the weight matrices. Just like data parallelism, after computing the gradients, an all-reduce operation is initiated to synchronize the gradients across the context parallelism group.</p>
         | 
| 1136 |  | 
| 1137 | 
             
                    <p>There is one important exception though, which is the <strong><em>attention module</em></strong>. In this module each token needs to access key/value pairs from <strong>all</strong> other sequence tokens or in the case of causal attention at least attends to each previous token.</p>
         | 
| @@ -1212,7 +1238,16 @@ | |
| 1212 |  | 
| 1213 | 
             
                    <p>In the TP section we saw that if we try to scale Tensor parallelism past the number of GPUs per single node (typically 4 or 8) we hit a lower bandwidth network called “inter-node connection” which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we perform it across several nodes:</p>
         | 
| 1214 |  | 
| 1215 | 
            -
                    < | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1216 | 
             
                    <p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
         | 
| 1217 |  | 
| 1218 | 
             
                    <p>Sequence and context parallelism can help for long sequences but don’t help much if sequence length is not the root cause of our memory issues but rather the size of the model itself. For large models (70B+), the size of the weights alone can already push past the limits of the 4-8 GPUs on a single node. We can solve this issue by summoning the fourth (and last) parallelism dimension: “pipeline parallelism”.</p>
         | 
|  | |
| 919 |  | 
| 920 | 
             
                    <p>However, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
         | 
| 921 |  | 
| 922 | 
            +
                    <iframe class="l-body-outset" id="plotFrame7" src="assets/data/benchmarks/tp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
         | 
| 923 | 
            +
                    <script>
         | 
| 924 | 
            +
                        window.addEventListener('load', function() {
         | 
| 925 | 
            +
                            const frame = document.getElementById('plotFrame7');
         | 
| 926 | 
            +
                            frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
         | 
| 927 | 
            +
                            frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
         | 
| 928 | 
            +
                        });
         | 
| 929 | 
            +
                    </script>
         | 
| 930 | 
            +
                    <!-- <p><img alt="tp_memoryusage.svg" src="/assets/images/tp_memoryusage.svg" /></p> -->
         | 
| 931 |  | 
| 932 | 
             
                    <p>As we can see, increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU. While tensor parallelism does help reduce activation memory in attention and feedforward layers by sharding the matrix multiplications across GPUs, we don't get the full memory benefits we could. This is because operations like layer normalization and dropout still require gathering the full activations on each GPU, partially negating the memory savings. We can do better by finding ways to parallelize these remaining operations as well.</p>
         | 
| 933 |  | 
|  | |
| 1076 |  | 
| 1077 | 
             
                    <p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
         | 
| 1078 |  | 
| 1079 | 
            +
                    <iframe class="l-body-outset" id="plotFrame8" src="assets/data/benchmarks/tp_sp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
         | 
| 1080 | 
            +
                    <script>
         | 
| 1081 | 
            +
                        window.addEventListener('load', function() {
         | 
| 1082 | 
            +
                            const frame = document.getElementById('plotFrame8');
         | 
| 1083 | 
            +
                            frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
         | 
| 1084 | 
            +
                            frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
         | 
| 1085 | 
            +
                        });
         | 
| 1086 | 
            +
                    </script>
         | 
| 1087 | 
            +
                    <!-- <p><img alt="tp_sp_memoryusage.svg" src="/assets/images/tp_sp_memoryusage.svg" /></p> -->
         | 
| 1088 |  | 
| 1089 | 
             
                    <p>Does that mean that SP incurs more communication than TP? Well, yes and no. In the forward of a vanilla TP we had two all-reduce per transformer block, and in SP we have two all-gather and two reduce-scatter per transformer block. So SP does twice the number of communication operations as TP. But since an all-reduce operation can be broken down into an all-gather + reduce-scatter (see in [TODO: Appendix link]) they’re actually equivalent in terms of communication. Same reasoning for backward as we just use the conjugate of each operation (no-op ↔ allreduce and allgather ↔ reducescatter).</p>
         | 
| 1090 |  | 
|  | |
| 1095 | 
             
                    <p>Besides the fact that TP requires communications in each layer, it also can’t easily be overlapped with compute, which makes throughput heavily dependent on the communication bandwidth. This is why TP is usually done only within a node (TP≤8).</p>
         | 
| 1096 |  | 
| 1097 |  | 
| 1098 | 
            +
                    <aside>Overlapping communication with computation for TP is an active area of research, with recent work like Domino <d-cite bibtex-key="wang2024domino"></d-cite> exploring novel techniques to maximize this overlap. </aside>
         | 
| 1099 |  | 
| 1100 | 
             
                    <p>As you might expect, this communication overhead becomes increasingly problematic as we scale up tensor parallelism. To illustrate this, let’s check throughput as we scale TP with SP for a 3B model:</p>
         | 
| 1101 | 
            +
                    <aside> For example, Megatron-LM/Nanotron implement a partial overlapping of all-gather with FC1 computation, and we expect to see more innovations in this space as the field continues to evolve.</aside>
         | 
| 1102 |  | 
| 1103 | 
             
                    <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/tp_sp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
         | 
| 1104 | 
             
                    <script>
         | 
|  | |
| 1139 |  | 
| 1140 | 
             
                    <p>Even if we use full recomputation of the activations, which comes at a heavy compute overhead (30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length:</p>
         | 
| 1141 |  | 
| 1142 | 
            +
                    <iframe class="l-body-outset" id="plotFrame9" src="assets/data/benchmarks/cp_8Bmemoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
         | 
| 1143 | 
            +
                    <script>
         | 
| 1144 | 
            +
                        window.addEventListener('load', function() {
         | 
| 1145 | 
            +
                            const frame = document.getElementById('plotFrame9');
         | 
| 1146 | 
            +
                            frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
         | 
| 1147 | 
            +
                            frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
         | 
| 1148 | 
            +
                        });
         | 
| 1149 | 
            +
                    </script>
         | 
| 1150 | 
            +
             | 
| 1151 | 
            +
                    <!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
         | 
| 1152 |  | 
| 1153 | 
             
                    <p>Can we apply similar ideas to our sequence parallelism approach but inside the modules where we apply Tensor Parallelism already, thereby also reducing the effect of sequence length? Yes, it’s time to talk about Context Parallelism, which you will find quite intuitive after all we’ve already covered.</p>
         | 
| 1154 |  | 
|  | |
| 1156 |  | 
| 1157 | 
             
                    <p>The idea of Context Parallelism is quite simple; just like Sequence Parallelism, we’ll split the input along the sequence dimension but we now apply this splitting along the full model, instead of only the sequence parallel regions of the model as we’ve done previously with Tensor + Sequence Parallelism.</p>
         | 
| 1158 |  | 
| 1159 | 
            +
                    <!-- <p><img alt="cp_8Bmemoryusage.svg" src="/assets/images/cp_8Bmemoryusage.svg" /></p>
         | 
| 1160 | 
            +
             -->
         | 
| 1161 | 
             
                    <p>Splitting the sequence doesn't affect most modules like MLP and LayerNorm, where each token is processed independently. It also doesn’t require expensive communication like TP, as only the inputs are split and not the weight matrices. Just like data parallelism, after computing the gradients, an all-reduce operation is initiated to synchronize the gradients across the context parallelism group.</p>
         | 
| 1162 |  | 
| 1163 | 
             
                    <p>There is one important exception though, which is the <strong><em>attention module</em></strong>. In this module each token needs to access key/value pairs from <strong>all</strong> other sequence tokens or in the case of causal attention at least attends to each previous token.</p>
         | 
|  | |
| 1238 |  | 
| 1239 | 
             
                    <p>In the TP section we saw that if we try to scale Tensor parallelism past the number of GPUs per single node (typically 4 or 8) we hit a lower bandwidth network called “inter-node connection” which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we perform it across several nodes:</p>
         | 
| 1240 |  | 
| 1241 | 
            +
                    <iframe class="l-body-outset" id="plotFrame11" src="assets/data/benchmarks/pp_comm_bandwidth.html" width="90%" scrolling="no" frameborder="0"></iframe>
         | 
| 1242 | 
            +
                    <script>
         | 
| 1243 | 
            +
                        window.addEventListener('load', function() {
         | 
| 1244 | 
            +
                            const frame = document.getElementById('plotFrame11');
         | 
| 1245 | 
            +
                            frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
         | 
| 1246 | 
            +
                            frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
         | 
| 1247 | 
            +
                        });
         | 
| 1248 | 
            +
                    </script>
         | 
| 1249 | 
            +
             | 
| 1250 | 
            +
                    <!-- <p><img alt="pp_comm_bandwidth.svg" src="/assets/images/pp_comm_bandwidth.svg" /></p> -->
         | 
| 1251 | 
             
                    <p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
         | 
| 1252 |  | 
| 1253 | 
             
                    <p>Sequence and context parallelism can help for long sequences but don’t help much if sequence length is not the root cause of our memory issues but rather the size of the model itself. For large models (70B+), the size of the weights alone can already push past the limits of the 4-8 GPUs on a single node. We can solve this issue by summoning the fourth (and last) parallelism dimension: “pipeline parallelism”.</p>
         | 
    	
        src/index.html
    CHANGED
    
    | @@ -919,7 +919,15 @@ | |
| 919 |  | 
| 920 | 
             
                    <p>However, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
         | 
| 921 |  | 
| 922 | 
            -
                    < | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 923 |  | 
| 924 | 
             
                    <p>As we can see, increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU. While tensor parallelism does help reduce activation memory in attention and feedforward layers by sharding the matrix multiplications across GPUs, we don't get the full memory benefits we could. This is because operations like layer normalization and dropout still require gathering the full activations on each GPU, partially negating the memory savings. We can do better by finding ways to parallelize these remaining operations as well.</p>
         | 
| 925 |  | 
| @@ -1068,7 +1076,15 @@ | |
| 1068 |  | 
| 1069 | 
             
                    <p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
         | 
| 1070 |  | 
| 1071 | 
            -
                    < | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1072 |  | 
| 1073 | 
             
                    <p>Does that mean that SP incurs more communication than TP? Well, yes and no. In the forward of a vanilla TP we had two all-reduce per transformer block, and in SP we have two all-gather and two reduce-scatter per transformer block. So SP does twice the number of communication operations as TP. But since an all-reduce operation can be broken down into an all-gather + reduce-scatter (see in [TODO: Appendix link]) they’re actually equivalent in terms of communication. Same reasoning for backward as we just use the conjugate of each operation (no-op ↔ allreduce and allgather ↔ reducescatter).</p>
         | 
| 1074 |  | 
| @@ -1079,9 +1095,10 @@ | |
| 1079 | 
             
                    <p>Besides the fact that TP requires communications in each layer, it also can’t easily be overlapped with compute, which makes throughput heavily dependent on the communication bandwidth. This is why TP is usually done only within a node (TP≤8).</p>
         | 
| 1080 |  | 
| 1081 |  | 
| 1082 | 
            -
                    <aside>Overlapping communication with computation for TP is an active area of research, with recent work like Domino <d-cite bibtex-key="wang2024domino"></d-cite> exploring novel techniques to maximize this overlap.  | 
| 1083 |  | 
| 1084 | 
             
                    <p>As you might expect, this communication overhead becomes increasingly problematic as we scale up tensor parallelism. To illustrate this, let’s check throughput as we scale TP with SP for a 3B model:</p>
         | 
|  | |
| 1085 |  | 
| 1086 | 
             
                    <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/tp_sp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
         | 
| 1087 | 
             
                    <script>
         | 
| @@ -1122,7 +1139,16 @@ | |
| 1122 |  | 
| 1123 | 
             
                    <p>Even if we use full recomputation of the activations, which comes at a heavy compute overhead (30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length:</p>
         | 
| 1124 |  | 
| 1125 | 
            -
                    < | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1126 |  | 
| 1127 | 
             
                    <p>Can we apply similar ideas to our sequence parallelism approach but inside the modules where we apply Tensor Parallelism already, thereby also reducing the effect of sequence length? Yes, it’s time to talk about Context Parallelism, which you will find quite intuitive after all we’ve already covered.</p>
         | 
| 1128 |  | 
| @@ -1130,8 +1156,8 @@ | |
| 1130 |  | 
| 1131 | 
             
                    <p>The idea of Context Parallelism is quite simple; just like Sequence Parallelism, we’ll split the input along the sequence dimension but we now apply this splitting along the full model, instead of only the sequence parallel regions of the model as we’ve done previously with Tensor + Sequence Parallelism.</p>
         | 
| 1132 |  | 
| 1133 | 
            -
                    <p><img alt="cp_8Bmemoryusage.svg" src="/assets/images/cp_8Bmemoryusage.svg" /></p>
         | 
| 1134 | 
            -
             | 
| 1135 | 
             
                    <p>Splitting the sequence doesn't affect most modules like MLP and LayerNorm, where each token is processed independently. It also doesn’t require expensive communication like TP, as only the inputs are split and not the weight matrices. Just like data parallelism, after computing the gradients, an all-reduce operation is initiated to synchronize the gradients across the context parallelism group.</p>
         | 
| 1136 |  | 
| 1137 | 
             
                    <p>There is one important exception though, which is the <strong><em>attention module</em></strong>. In this module each token needs to access key/value pairs from <strong>all</strong> other sequence tokens or in the case of causal attention at least attends to each previous token.</p>
         | 
| @@ -1212,7 +1238,16 @@ | |
| 1212 |  | 
| 1213 | 
             
                    <p>In the TP section we saw that if we try to scale Tensor parallelism past the number of GPUs per single node (typically 4 or 8) we hit a lower bandwidth network called “inter-node connection” which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we perform it across several nodes:</p>
         | 
| 1214 |  | 
| 1215 | 
            -
                    < | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1216 | 
             
                    <p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
         | 
| 1217 |  | 
| 1218 | 
             
                    <p>Sequence and context parallelism can help for long sequences but don’t help much if sequence length is not the root cause of our memory issues but rather the size of the model itself. For large models (70B+), the size of the weights alone can already push past the limits of the 4-8 GPUs on a single node. We can solve this issue by summoning the fourth (and last) parallelism dimension: “pipeline parallelism”.</p>
         | 
|  | |
| 919 |  | 
| 920 | 
             
                    <p>However, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
         | 
| 921 |  | 
| 922 | 
            +
                    <iframe class="l-body-outset" id="plotFrame7" src="assets/data/benchmarks/tp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
         | 
| 923 | 
            +
                    <script>
         | 
| 924 | 
            +
                        window.addEventListener('load', function() {
         | 
| 925 | 
            +
                            const frame = document.getElementById('plotFrame7');
         | 
| 926 | 
            +
                            frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
         | 
| 927 | 
            +
                            frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
         | 
| 928 | 
            +
                        });
         | 
| 929 | 
            +
                    </script>
         | 
| 930 | 
            +
                    <!-- <p><img alt="tp_memoryusage.svg" src="/assets/images/tp_memoryusage.svg" /></p> -->
         | 
| 931 |  | 
| 932 | 
             
                    <p>As we can see, increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU. While tensor parallelism does help reduce activation memory in attention and feedforward layers by sharding the matrix multiplications across GPUs, we don't get the full memory benefits we could. This is because operations like layer normalization and dropout still require gathering the full activations on each GPU, partially negating the memory savings. We can do better by finding ways to parallelize these remaining operations as well.</p>
         | 
| 933 |  | 
|  | |
| 1076 |  | 
| 1077 | 
             
                    <p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
         | 
| 1078 |  | 
| 1079 | 
            +
                    <iframe class="l-body-outset" id="plotFrame8" src="assets/data/benchmarks/tp_sp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
         | 
| 1080 | 
            +
                    <script>
         | 
| 1081 | 
            +
                        window.addEventListener('load', function() {
         | 
| 1082 | 
            +
                            const frame = document.getElementById('plotFrame8');
         | 
| 1083 | 
            +
                            frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
         | 
| 1084 | 
            +
                            frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
         | 
| 1085 | 
            +
                        });
         | 
| 1086 | 
            +
                    </script>
         | 
| 1087 | 
            +
                    <!-- <p><img alt="tp_sp_memoryusage.svg" src="/assets/images/tp_sp_memoryusage.svg" /></p> -->
         | 
| 1088 |  | 
| 1089 | 
             
                    <p>Does that mean that SP incurs more communication than TP? Well, yes and no. In the forward of a vanilla TP we had two all-reduce per transformer block, and in SP we have two all-gather and two reduce-scatter per transformer block. So SP does twice the number of communication operations as TP. But since an all-reduce operation can be broken down into an all-gather + reduce-scatter (see in [TODO: Appendix link]) they’re actually equivalent in terms of communication. Same reasoning for backward as we just use the conjugate of each operation (no-op ↔ allreduce and allgather ↔ reducescatter).</p>
         | 
| 1090 |  | 
|  | |
| 1095 | 
             
                    <p>Besides the fact that TP requires communications in each layer, it also can’t easily be overlapped with compute, which makes throughput heavily dependent on the communication bandwidth. This is why TP is usually done only within a node (TP≤8).</p>
         | 
| 1096 |  | 
| 1097 |  | 
| 1098 | 
            +
                    <aside>Overlapping communication with computation for TP is an active area of research, with recent work like Domino <d-cite bibtex-key="wang2024domino"></d-cite> exploring novel techniques to maximize this overlap. </aside>
         | 
| 1099 |  | 
| 1100 | 
             
                    <p>As you might expect, this communication overhead becomes increasingly problematic as we scale up tensor parallelism. To illustrate this, let’s check throughput as we scale TP with SP for a 3B model:</p>
         | 
| 1101 | 
            +
                    <aside> For example, Megatron-LM/Nanotron implement a partial overlapping of all-gather with FC1 computation, and we expect to see more innovations in this space as the field continues to evolve.</aside>
         | 
| 1102 |  | 
| 1103 | 
             
                    <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/tp_sp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
         | 
| 1104 | 
             
                    <script>
         | 
|  | |
| 1139 |  | 
| 1140 | 
             
                    <p>Even if we use full recomputation of the activations, which comes at a heavy compute overhead (30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length:</p>
         | 
| 1141 |  | 
| 1142 | 
            +
                    <iframe class="l-body-outset" id="plotFrame9" src="assets/data/benchmarks/cp_8Bmemoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
         | 
| 1143 | 
            +
                    <script>
         | 
| 1144 | 
            +
                        window.addEventListener('load', function() {
         | 
| 1145 | 
            +
                            const frame = document.getElementById('plotFrame9');
         | 
| 1146 | 
            +
                            frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
         | 
| 1147 | 
            +
                            frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
         | 
| 1148 | 
            +
                        });
         | 
| 1149 | 
            +
                    </script>
         | 
| 1150 | 
            +
             | 
| 1151 | 
            +
                    <!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
         | 
| 1152 |  | 
| 1153 | 
             
                    <p>Can we apply similar ideas to our sequence parallelism approach but inside the modules where we apply Tensor Parallelism already, thereby also reducing the effect of sequence length? Yes, it’s time to talk about Context Parallelism, which you will find quite intuitive after all we’ve already covered.</p>
         | 
| 1154 |  | 
|  | |
| 1156 |  | 
| 1157 | 
             
                    <p>The idea of Context Parallelism is quite simple; just like Sequence Parallelism, we’ll split the input along the sequence dimension but we now apply this splitting along the full model, instead of only the sequence parallel regions of the model as we’ve done previously with Tensor + Sequence Parallelism.</p>
         | 
| 1158 |  | 
| 1159 | 
            +
                    <!-- <p><img alt="cp_8Bmemoryusage.svg" src="/assets/images/cp_8Bmemoryusage.svg" /></p>
         | 
| 1160 | 
            +
             -->
         | 
| 1161 | 
             
                    <p>Splitting the sequence doesn't affect most modules like MLP and LayerNorm, where each token is processed independently. It also doesn’t require expensive communication like TP, as only the inputs are split and not the weight matrices. Just like data parallelism, after computing the gradients, an all-reduce operation is initiated to synchronize the gradients across the context parallelism group.</p>
         | 
| 1162 |  | 
| 1163 | 
             
                    <p>There is one important exception though, which is the <strong><em>attention module</em></strong>. In this module each token needs to access key/value pairs from <strong>all</strong> other sequence tokens or in the case of causal attention at least attends to each previous token.</p>
         | 
|  | |
| 1238 |  | 
| 1239 | 
             
                    <p>In the TP section we saw that if we try to scale Tensor parallelism past the number of GPUs per single node (typically 4 or 8) we hit a lower bandwidth network called “inter-node connection” which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we perform it across several nodes:</p>
         | 
| 1240 |  | 
| 1241 | 
            +
                    <iframe class="l-body-outset" id="plotFrame11" src="assets/data/benchmarks/pp_comm_bandwidth.html" width="90%" scrolling="no" frameborder="0"></iframe>
         | 
| 1242 | 
            +
                    <script>
         | 
| 1243 | 
            +
                        window.addEventListener('load', function() {
         | 
| 1244 | 
            +
                            const frame = document.getElementById('plotFrame11');
         | 
| 1245 | 
            +
                            frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
         | 
| 1246 | 
            +
                            frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
         | 
| 1247 | 
            +
                        });
         | 
| 1248 | 
            +
                    </script>
         | 
| 1249 | 
            +
             | 
| 1250 | 
            +
                    <!-- <p><img alt="pp_comm_bandwidth.svg" src="/assets/images/pp_comm_bandwidth.svg" /></p> -->
         | 
| 1251 | 
             
                    <p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
         | 
| 1252 |  | 
| 1253 | 
             
                    <p>Sequence and context parallelism can help for long sequences but don’t help much if sequence length is not the root cause of our memory issues but rather the size of the model itself. For large models (70B+), the size of the weights alone can already push past the limits of the 4-8 GPUs on a single node. We can solve this issue by summoning the fourth (and last) parallelism dimension: “pipeline parallelism”.</p>
         | 
