M. Kan and H. Thi. Proceedings of the 14th ACM International Conference on Information and Knowledge Management, pages 325--326. New York, NY, USA, ACM, (2005)
DOI: 10.1145/1099554.1099649
Abstract
We demonstrate the usefulness of the uniform resource locator (URL) alone in performing web page classification. This approach is faster than typical web page classification, as the pages do not have to be fetched and analyzed. Our approach segments the URL into meaningful chunks and adds component, sequential and orthographic features to model salient patterns. The resulting features are used in supervised maximum entropy modeling. We analyze our approach's effectiveness on two standardized domains. Our results show that in certain scenarios, URL-based methods approach the performance of current state-of-the-art full-text and link-based methods.
%0 Conference Paper
%1 Kan:2005:FWC:1099554.1099649
%A Kan, Min-Yen
%A Thi, Hoang Oanh Nguyen
%B Proceedings of the 14th ACM International Conference on Information and Knowledge Management
%C New York, NY, USA
%D 2005
%I ACM
%K classification phdproposal url
%P 325--326
%R 10.1145/1099554.1099649
%T Fast Webpage Classification Using URL Features
%U http://doi.acm.org/10.1145/1099554.1099649
%X We demonstrate the usefulness of the uniform resource locator (URL) alone in performing web page classification. This approach is faster than typical web page classification, as the pages do not have to be fetched and analyzed. Our approach segments the URL into meaningful chunks and adds component, sequential and orthographic features to model salient patterns. The resulting features are used in supervised maximum entropy modeling. We analyze our approach's effectiveness on two standardized domains. Our results show that in certain scenarios, URL-based methods approach the performance of current state-of-the-art full-text and link-based methods.
%@ 1-59593-140-6
@comment{
  Conference paper from the CIKM '05 proceedings (pages 325--326) on
  classifying web pages from their URL alone.
  The acronyms in title and booktitle are wrapped in braces so that
  bibliography styles which sentence-case titles keep them uppercase.
  The remaining fields are kept exactly as exported (acmid, numpages,
  location, interhash, intrahash, biburl, added-at, timestamp); standard
  styles ignore the ones they do not know.
}
@inproceedings{Kan:2005:FWC:1099554.1099649,
  abstract = {We demonstrate the usefulness of the uniform resource locator (URL) alone in performing web page classification. This approach is faster than typical web page classification, as the pages do not have to be fetched and analyzed. Our approach segments the URL into meaningful chunks and adds component, sequential and orthographic features to model salient patterns. The resulting features are used in supervised maximum entropy modeling. We analyze our approach's effectiveness on two standardized domains. Our results show that in certain scenarios, URL-based methods approach the performance of current state-of-the-art full-text and link-based methods.},
  acmid = {1099649},
  added-at = {2015-03-08T22:19:55.000+0100},
  address = {New York, NY, USA},
  author = {Kan, Min-Yen and Thi, Hoang Oanh Nguyen},
  biburl = {https://www.bibsonomy.org/bibtex/2da99dc0b038a3243b400387bb61b3548/asmelash},
  booktitle = {Proceedings of the 14th {ACM} International Conference on Information and Knowledge Management},
  description = {Fast webpage classification using URL features},
  doi = {10.1145/1099554.1099649},
  interhash = {40d1c3c0276a53a3a2220becbb541354},
  intrahash = {da99dc0b038a3243b400387bb61b3548},
  isbn = {1-59593-140-6},
  keywords = {classification phdproposal url},
  location = {Bremen, Germany},
  numpages = {2},
  pages = {325--326},
  publisher = {ACM},
  series = {CIKM '05},
  timestamp = {2015-03-08T22:19:55.000+0100},
  title = {Fast Webpage Classification Using {URL} Features},
  url = {http://doi.acm.org/10.1145/1099554.1099649},
  year = {2005},
}