We investigate the subtle cues to user identity that may be exploited
in attacks on the privacy of users in web search query logs. We
study the application of simple classifiers to map a sequence of
queries into the gender, age, and location of the user issuing the
queries. We then show how these classifiers may be carefully combined
at multiple granularities to map a sequence of queries into a
set of candidate users that is 300-600 times smaller than random
chance would allow. We show that this approach remains accurate
even after removing personally identifiable information such
as names/numbers or limiting the size of the query log.
We also present a new attack in which a real-world acquaintance
of a user attempts to identify that user in a large query log, using
personal information. We show that combinations of small pieces
of information about terms a user would probably search for can be
highly effective in identifying the sessions of that user.
We conclude that known schemes to release even heavily scrubbed
query logs that contain session information have significant privacy
risks.
%0 Conference Paper
%1 paper:jones:2007
%A Jones, Rosie
%A Kumar, Ravi
%A Pang, Bo
%A Tomkins, Andrew
%B CIKM '07: Proceedings of the sixteenth ACM conference on information and knowledge management
%C New York, NY, USA
%D 2007
%I ACM
%K 2007 ethics query reading-group
%P 909--914
%R http://doi.acm.org/10.1145/1321440.1321573
%T "I know what you did last summer": query logs and user privacy
%U http://portal.acm.org/citation.cfm?id=1321440.1321573&coll=GUIDE&dl=GUIDE
%X We investigate the subtle cues to user identity that may be exploited
in attacks on the privacy of users in web search query logs. We
study the application of simple classifiers to map a sequence of
queries into the gender, age, and location of the user issuing the
queries. We then show how these classifiers may be carefully combined
at multiple granularities to map a sequence of queries into a
set of candidate users that is 300-600 times smaller than random
chance would allow. We show that this approach remains accurate
even after removing personally identifiable information such
as names/numbers or limiting the size of the query log.
We also present a new attack in which a real-world acquaintance
of a user attempts to identify that user in a large query log, using
personal information. We show that combinations of small pieces
of information about terms a user would probably search for can be
highly effective in identifying the sessions of that user.
We conclude that known schemes to release even heavily scrubbed
query logs that contain session information have significant privacy
risks.
%@ 978-1-59593-803-9
% Conference paper (CIKM '07) on privacy risks of releasing web search query logs.
% The fields added-at, biburl, interhash, intrahash and timestamp look like
% bookkeeping data from the exporting service (presumably BibSonomy, per biburl);
% they are not standard fields and are ignored by standard bibliography styles.
@inproceedings{paper:jones:2007,
  author      = {Jones, Rosie and Kumar, Ravi and Pang, Bo and Tomkins, Andrew},
  title       = {``{I} know what you did last summer'': query logs and user privacy},
  booktitle   = {{CIKM} '07: Proceedings of the Sixteenth {ACM} Conference on Information and Knowledge Management},
  venue       = {Lisbon, Portugal},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  year        = {2007},
  pages       = {909--914},
  isbn        = {978-1-59593-803-9},
  doi         = {10.1145/1321440.1321573},
  url         = {http://portal.acm.org/citation.cfm?id=1321440.1321573&coll=GUIDE&dl=GUIDE},
  keywords    = {2007 ethics query reading-group},
  description = {"I know what you did last summer"},
  abstract    = {We investigate the subtle cues to user identity that may be exploited
in attacks on the privacy of users in web search query logs. We
study the application of simple classifiers to map a sequence of
queries into the gender, age, and location of the user issuing the
queries. We then show how these classifiers may be carefully combined
at multiple granularities to map a sequence of queries into a
set of candidate users that is 300-600 times smaller than random
chance would allow. We show that this approach remains accurate
even after removing personally identifiable information such
as names/numbers or limiting the size of the query log.
We also present a new attack in which a real-world acquaintance
of a user attempts to identify that user in a large query log, using
personal information. We show that combinations of small pieces
of information about terms a user would probably search for can be
highly effective in identifying the sessions of that user.
We conclude that known schemes to release even heavily scrubbed
query logs that contain session information have significant privacy
risks.},
  added-at    = {2008-10-28T12:57:06.000+0100},
  timestamp   = {2008-10-28T12:57:06.000+0100},
  biburl      = {https://www.bibsonomy.org/bibtex/2153f00de0cb4daed7c856f34f15c3f71/mschuber},
  interhash   = {af09c9df301acf362da978ee56eb08b6},
  intrahash   = {153f00de0cb4daed7c856f34f15c3f71},
}