@inproceedings{202,
  title        = {Liquid: Unifying Nearline and Offline Big Data Integration},
  booktitle    = {Biennial Conference on Innovative Data Systems Research ({CIDR})},
  year         = {2015},
  month        = jan,
  publisher    = {ACM},
  organization = {ACM},
  address      = {Asilomar, CA, USA},
  abstract     = {With more sophisticated data-parallel processing systems, the new bottleneck in data-intensive companies shifts from the back-end data systems to the data integration stack, which is responsible for the pre-processing of data for back-end applications. The use of back-end data systems with different access latencies and data integration requirements poses new challenges that current data integration stacks based on distributed file systems{\textemdash}proposed a decade ago for batch-oriented processing{\textemdash}cannot address.

In this paper, we describe Liquid, a data integration stack that provides low latency data access to support near real-time in addition to batch applications. It supports incremental processing, and is cost-efficient and highly available. Liquid has two layers: a processing layer based on a stateful stream processing model, and a messaging layer with a highly-available publish/subscribe system. We report our experience of a Liquid deployment with backend data systems at LinkedIn, a data-intensive company with over 300 million users.},
  keywords     = {seep},
  author       = {Castro Fernandez, Raul and Pietzuch, Peter and Koshy, Joel and Kreps, Jay and Lin, Dong and Narkhede, Neha and Rao, Jun and Riccomini, Chris and Wang, Guozhang},
}