@inproceedings{202,
  title        = {Liquid: Unifying Nearline and Offline Big Data Integration},
  booktitle    = {Biennial Conference on Innovative Data Systems Research ({CIDR})},
  year         = {2015},
  month        = jan,
  publisher    = {ACM},
  organization = {ACM},
  address      = {Asilomar, CA, USA},
  abstract     = {With more sophisticated data-parallel processing systems, the new bottleneck in data-intensive companies shifts from the back-end data systems to the data integration stack, which is responsible for the pre-processing of data for back-end applications. The use of back-end data systems with different access latencies and data integration requirements poses new challenges that current data integration stacks based on distributed file systems{\textemdash}proposed a decade ago for batch-oriented processing{\textemdash}cannot address.

In this paper, we describe Liquid, a data integration stack that provides low latency data access to support near real-time in addition to batch applications. It supports incremental processing, and is cost-efficient and highly available. Liquid has two layers: a processing layer based on a stateful stream processing model, and a messaging layer with a highly-available publish/subscribe system. We report our experience of a Liquid deployment with backend data systems at LinkedIn, a data-intensive company with over 300 million users.},
  keywords     = {seep},
  author       = {Castro Fernandez, Raul and Pietzuch, Peter and Koshy, Joel and Kreps, Jay and Lin, Dong and Narkhede, Neha and Rao, Jun and Riccomini, Chris and Wang, Guozhang},
}