% Thelwall (2005), "Creating and using Web corpora", IJCL 10(4).
% Cleaned from a publisher export: bare DOI (no resolver prefix), double-hyphen
% page range, the three repeated keywords fields merged into one, and the
% proper noun "Web" brace-protected in the title. Citation key is unchanged.
@article{jbp:/content/journals/10.1075/ijcl.10.4.07the,
  author    = {Thelwall, Mike},
  title     = {Creating and using {Web} corpora},
  journal   = {International Journal of Corpus Linguistics},
  year      = {2005},
  volume    = {10},
  number    = {4},
  pages     = {517--541},
  publisher = {John Benjamins},
  issn      = {1384-6655},
  doi       = {10.1075/ijcl.10.4.07the},
  url       = {https://www.jbe-platform.com/content/journals/10.1075/ijcl.10.4.07the},
  type      = {Journal Article},
  keywords  = {academic language, web corpus, web},
  abstract  = {The Web has recently been used as a corpus for linguistic investigations, often with the help of a commercial search engine. We discuss some potential problems with collecting data from commercial search engine and with using the Web as a corpus. We outline an alternative strategy for data collection, using a personal Web crawler. As a case study, the university Web sites of three nations (Australia, New Zealand and the UK) were crawled. The most frequent words were broadly consistent with non-Web written English, but with some academic-related words amongst the top 50 most frequent. It was also evident that the university Web sites contained a significant amount of non-English text, and academic Web English seems to be more future-oriented than British National Corpus written English.},
}