@inproceedings{339,
  title     = {{SwiftAnalytics}: Optimizing Object Stores for Big Data Analytics},
  booktitle = {{IEEE} International Conference on Cloud Engineering ({IC2E})},
  year      = {2017},
  month     = apr,
  publisher = {IEEE},
  address   = {Vancouver, Canada},
  abstract  = {Due to their scalability and low cost, object-based storage systems are an attractive storage solution and widely deployed. To gain valuable insight from the data residing in object storage but avoid expensive copying to a distributed filesystem (e.g. HDFS), it would be natural to directly use them as a storage backend for data-parallel analytics frameworks such as Spark or MapReduce. Unfortunately, executing data-parallel frameworks on object storage exhibits severe performance problems, reducing average job completion times by up to 6.5{\texttimes}.
We identify the two most severe performance problems when running data-parallel frameworks on the OpenStack Swift object storage system in comparison to the HDFS distributed filesystem: (i) the fixed mapping of object names to storage nodes prevents local writes and adds delay when objects are renamed; (ii) the coarser granularity of objects compared to blocks reduces data locality during reads. We propose the SwiftAnalytics object storage system to address them: (i) it uses locality-aware writes to control an object{\textquoteright}s location and eliminate unnecessary I/O related to renames during job completion, speeding up analytics jobs by up to 5.1{\texttimes}; (ii) it transparently chunks objects into smaller sized parts to improve data-locality, leading to up to 3.4{\texttimes} faster reads.},
  keywords  = {naas},
  author    = {Rupprecht, Lukas and Zhang, Rui and Owen, Bill and Pietzuch, Peter and Hildebrand, Dean},
}