Last compiled on May, 2025



The first step is to select a set of names (or X’s, in NSUM terminology) that represent social groups (by age, ethnic background, gender, and so forth) well. Hence, we first need to find out what those names are. To do so, we look at name popularity in the Meertens Voornamenbank, which shows name popularity by year. We do this via a relatively straightforward crawl of that website.



1 Initializing the R environment

Start out with a custom function to load a set of required packages.

# Start from an empty workspace: remove every (non-hidden) object from the
# current environment before loading packages and reading data.
# NOTE(review): this also deletes whatever the user already had in their
# session, and it does not detach packages or reset options -- consider
# running the script in a fresh R session instead.
rm(list = ls())

# Scraper to collect popularity lists of first names in the Netherlands, per year.
# Rense Corten, Utrecht University, April 2021

# ----------- LOAD THE REQUIRED PACKAGES ---------------- |

# (c) Jochem Tolsma
# Attach every package named in `packages`, installing the ones that cannot
# be loaded yet.
#
# Arguments:
#   packages  character vector of package names.
#
# Returns a list with one element per package: NULL when the package could
# be attached straight away, otherwise the value of the library() call made
# after installing it.
fpackage.check <- function(packages) {
    ensure_attached <- function(pkg) {
        # require() reports failure as FALSE instead of raising an error,
        # which is what makes the install-on-demand fallback possible.
        is_available <- require(pkg, character.only = TRUE)
        if (is_available) {
            return(NULL)
        }
        install.packages(pkg, dependencies = TRUE)
        library(pkg, character.only = TRUE)
    }
    lapply(packages, FUN = ensure_attached)
}
# Packages this script relies on: tidyverse for the pipe and mutate(),
# rvest for html_table(), polite for bow(), nod() and scrape().
packages <- c(
    "tidyverse",
    "rvest",
    "polite"
)
fpackage.check(packages)



2 Custom functions

Then declare a custom function that does a polite crawl of first name data from the Meertens Voornamenbank.

# ----------- FUNCTIONS ---------------- |

# Scrape the top first names of one year from the Meertens Voornamenbank.
#
# Arguments:
#   session  polite session created by bow()-ing to the host; it is nod()-ed
#            to the year-specific path before the page is scraped.
#   year     year to scrape; appended to the path and stored in the `year`
#            column of the result.
#
# Returns a data frame with the columns rank, name, count, is_girl_name
# (1 for girl names, 0 for boy names) and year. Girl names come first,
# followed by boy names.
#
# Stops with an informative error when no page comes back or when the page
# does not contain the three tables this function relies on.
get_year_names <- function(session, year) {
  # Path of the page that lists the top names for this specific year.
  yr_path <- paste0("nvb/topnamen/land/Nederland/", as.character(year))

  # Agree the change of path with the host (assuming we have already
  # "bowed" for the higher-level path), then fetch the page.
  year_session <- nod(session, path = yr_path)
  year_page <- scrape(year_session)

  # scrape() may hand back NULL instead of a page (e.g. when the request is
  # refused or fails); fail here with a clear message rather than later
  # inside html_table().
  if (is.null(year_page)) {
    stop("No page could be scraped for year ", year,
         " (path: ", yr_path, ")", call. = FALSE)
  }

  # The page parses into a list of tables; the second holds the boy names
  # and the third the girl names.
  page_tables <- html_table(year_page)
  if (length(page_tables) < 3) {
    stop("Expected at least 3 tables for year ", year, " but found ",
         length(page_tables), " (path: ", yr_path, ")", call. = FALSE)
  }

  # Give one name table its standard column names and add the gender dummy.
  label_names <- function(name_table, girl_flag) {
    colnames(name_table) <- c("rank", "name", "count")
    name_table %>%
      as.data.frame() %>%
      mutate(is_girl_name = girl_flag)
  }

  boy_names <- label_names(page_tables[[2]], 0)
  girl_names <- label_names(page_tables[[3]], 1)

  # Combine the two as a new data frame and record which year it describes.
  rbind(girl_names, boy_names) %>%
    mutate(year = year)
}



3 Crawling

The next step is to actually crawl the names for a specific set of years and bind the results into a single data frame.

# ----------- IMPLEMENT THE SCRAPER ---------------- |

# Check permissions and introduce myself to the host.
session <- bow(
  "https://www.meertens.knaw.nl/nvb/",
  user_agent = "R. Corten, Universiteit Utrecht",
  delay = 1
)
session

# Empty, typed template for the results; it is bound in front of the scraped
# data below so the result always starts from these columns.
all_names <- data.frame(
  rank = integer(),
  name = character(),
  count = integer(),
  is_girl_name = integer(),
  year = integer()
)

startyear <- 1950
endyear <- 2014

# Years are visited from the most recent one backwards.
years <- endyear:startyear

# Collect each year's names in a preallocated list and bind them once at the
# end, instead of growing the data frame with rbind() on every iteration.
# If the crawl stops halfway, the years scraped so far remain available in
# `names_by_year`.
names_by_year <- vector("list", length(years))

for (k in seq_along(years)) { # loop over all years
  print(paste("scraping year", years[k]))
  names_by_year[[k]] <- get_year_names(session, years[k])
}

all_names <- do.call(rbind, c(list(all_names), names_by_year))



4 Saving the data

The final step is to save the collected data to a CSV file, so that we can inspect the counts for these names and select our X’s from them.

# Write the combined names to "all_names_<startyear><endyear>.csv" in the
# working directory (the two years are concatenated without a separator),
# leaving out the row names.
write.csv(
    all_names,
    row.names = FALSE,
    file = paste0("all_names_", startyear, endyear, ".csv")
)
LS0tCnRpdGxlOiAiRmluZGluZyByZWxldmFudCBuYW1lcyAoWCdzKSIKI2JpYmxpb2dyYXBoeTogcmVmZXJlbmNlcy5iaWIKYXV0aG9yOiAiUmVuc2UgQ29ydGVuIgotLS0KCmBgYHtyLCBnbG9iYWxzZXR0aW5ncywgZWNobz1GQUxTRSwgd2FybmluZz1GQUxTRSwgcmVzdWx0cz0naGlkZSd9CmxpYnJhcnkoa25pdHIpCgprbml0cjo6b3B0c19jaHVuayRzZXQoZWNobyA9IFRSVUUpCm9wdHNfY2h1bmskc2V0KHRpZHkub3B0cz1saXN0KHdpZHRoLmN1dG9mZj0xMDApLHRpZHk9VFJVRSwgd2FybmluZyA9IEZBTFNFLCBtZXNzYWdlID0gRkFMU0UsY29tbWVudCA9ICIjPiIsIGNhY2hlPVRSVUUsIGNsYXNzLnNvdXJjZT1jKCJ0ZXN0IiksIGNsYXNzLm91dHB1dD1jKCJ0ZXN0MiIpKQpvcHRpb25zKHdpZHRoID0gMTAwKQpyZ2w6OnNldHVwS25pdHIoKQoKCgpjb2xvcml6ZSA8LSBmdW5jdGlvbih4LCBjb2xvcikge3NwcmludGYoIjxzcGFuIHN0eWxlPSdjb2xvcjogJXM7Jz4lczwvc3Bhbj4iLCBjb2xvciwgeCkgfQoKYGBgCgpgYGB7ciBrbGlwcHksIGVjaG89RkFMU0UsIGluY2x1ZGU9VFJVRX0Ka2xpcHB5OjprbGlwcHkocG9zaXRpb24gPSBjKCd0b3AnLCAncmlnaHQnKSkKI2tsaXBweTo6a2xpcHB5KGNvbG9yID0gJ2RhcmtyZWQnKQoja2xpcHB5OjprbGlwcHkodG9vbHRpcF9tZXNzYWdlID0gJ0NsaWNrIHRvIGNvcHknLCB0b29sdGlwX3N1Y2Nlc3MgPSAnRG9uZScpCmBgYAoKTGFzdCBjb21waWxlZCBvbiBgciBmb3JtYXQoU3lzLnRpbWUoKSwgJyVCLCAlWScpYAoKPGJyPgoKLS0tLQoKVGhlIGZpcnN0IHN0ZXAgaXMgdG8gc2VsZWN0IGEgc2V0IG9mIG5hbWVzIChvciBYJ3MsIGluIE5TVU0gdGVybWlub2xvZ3kpIHRoYXQgcmVwcmVzZW50IHNvY2lhbCBncm91cHMgKGJ5IGFnZSwgZXRobmljIGJhY2tncm91bmQsIGdlbmRlciwgYW5kIHNvIGZvcnRoKSB3ZWxsLiBIZW5jZSwgd2UgZmlyc3QgbmVlZCB0byBmaW5kIG91dCB3aGF0IHRob3NlIG5hbWVzIGFyZS4gSW4gYSBmaXJzdCBzdGVwLCB3ZSB0aGVyZWZvcmUgbG9vayBhdCBuYW1lIHBvcHVsYXJpdHkgZnJvbSB0aGUgW01lZXJ0ZW4gVm9vcm5hbWVuYmFua10oaHR0cHM6Ly93d3cubWVlcnRlbnMua25hdy5ubC9udmIvKSB0aGF0IHNob3dzIG5hbWUgcG9wdWxhcml0eSBieSB5ZWFyLiBXZSBkbyB0aGlzIHZpYSByZWxhdGl2ZWx5IHN0cmFpZ2h0Zm9yd2FyZCBjcmF3bCBvZiB0aGF0IHdlYnNpdGUuCgo8YnI+CgotLS0tCgojIEluaXRhdGlhdGluZyBSIGVudmlyb25tZW50CgpTdGFydCBvdXQgd2l0aCBhIGN1c3RvbSBmdW5jdGlvbiB0byBsb2FkIGEgc2V0IG9mIHJlcXVpcmVkIHBhY2thZ2VzLgogIApgYGB7ciwgZXZhbD1GQUxTRX0KIyBwYWNrYWdlcyBhbmQgcmVhZCBkYXRhCnJtKGxpc3QgPSBscygpKQoKIyBzY3JhcGVyIHRvIGNvbGxlY3QgcG9wdWxhcml0eSBsaXN0cyBvZiBuYW1lcyBpbiB0aGUgTmV0aGVybGFuZHMsIHBlciB5ZWFyCiMgUmVuc2UgQ29ydGVuLCBVdHJlY2h0
IFVuaXZlcnNpdHksIEFwcmlsIDIwMjEKCiMgLS0tLS0tLS0tLS0gTE9BRCBUSEUgUkVRVUlSRUQgUEFDS0FHRVMgLS0tLS0tLS0tLS0tLS0tLSB8CgojIChjKSBKb2NoZW0gVG9sc21hCmZwYWNrYWdlLmNoZWNrIDwtIGZ1bmN0aW9uKHBhY2thZ2VzKSB7CiAgbGFwcGx5KHBhY2thZ2VzLCBGVU4gPSBmdW5jdGlvbih4KSB7CiAgICBpZiAoIXJlcXVpcmUoeCwgY2hhcmFjdGVyLm9ubHkgPSBUUlVFKSkgewogICAgICBpbnN0YWxsLnBhY2thZ2VzKHgsIGRlcGVuZGVuY2llcyA9IFRSVUUpCiAgICAgIGxpYnJhcnkoeCwgY2hhcmFjdGVyLm9ubHkgPSBUUlVFKQogICAgfQogIH0pCn0KcGFja2FnZXMgPSBjKCJ0aWR5dmVyc2UiLCAicnZlc3QiLCAicG9saXRlIikKZnBhY2thZ2UuY2hlY2socGFja2FnZXMpCmBgYAoKPGJyPgoKLS0tLQoKIyBDdXN0b20gZnVuY3Rpb25zCgpUaGVuIGRlY2xhcmUgYSBjdXN0b20gZnVuY3Rpb24gdGhhdCBkb2VzIGEgcG9saXRlIGNyYXdsIG9mIGZpcnN0IG5hbWUgZGF0YSBmcm9tIHRoZSBNZWVydGVucyBWb29ybmFtZW5iYW5rLgogIApgYGB7ciwgZXZhbD1GQUxTRX0KIyAtLS0tLS0tLS0tLSBGVU5DVElPTlMgLS0tLS0tLS0tLS0tLS0tLSB8CgpnZXRfeWVhcl9uYW1lcyA8LSBmdW5jdGlvbihzZXNzaW9uLCB5ZWFyKXsKICAjeWVhciA9IDIwMTQKICB5cl9wYXRoID0gcGFzdGUoIm52Yi90b3BuYW1lbi9sYW5kL05lZGVybGFuZC8iLCBhcy5jaGFyYWN0ZXIoeWVhciksc2VwPSIiKSAjIHNldCB0aGUgcGF0aCBmb3IgdGhlIHNwZWNpZmljIHllYXIncyB3ZWJwYWdlCiAgCiAgeWVhcl9zZXNzaW9uIDwtbm9kKHNlc3Npb24sIHBhdGggPSB5cl9wYXRoKSAjIGFncmVlIGNoYW5naW5nIG9mIHRoZSBwYXRoIHdpdGggdGhlIGhvc3QgKGFzc3VtaW5nIEkgaGF2ZSBhbHJlYWR5ICJib3dlZCIgZm9yIHRoZSBoaWdoZXItbGV2ZWwgcGF0aCkKICAKICB5ZWFyX3BhZ2UgPC0gc2NyYXBlKHllYXJfc2Vzc2lvbikgIyBnZXQgdGhlIHBhZ2UgZm9yIHRoaXMgeWVhcgogIAogIGFsbF9uYW1lcyA8LXllYXJfcGFnZSAlPiUgICMgcGFyc2UgdGhlIHBhZ2UgYXMgYSB0YWJsZS4gVHVybnMgb3V0IHRoaXMgaXMgYSBsaXN0IG9mIHRocmVlIHRhYmxlczsgd2UgbmVlZCBudW1iZXJzIDIgYW5kIDMKICAgIGh0bWxfdGFibGUoKQogIAogIGJveV9uYW1lcyA8LSBhbGxfbmFtZXNbWzJdXSAjIHNlY29uZCB0YWJsZSBmcm9tIHRoZSBsaXN0IG9mIHRocmVlCiAgY29sbmFtZXMoYm95X25hbWVzKSA8LSBjKCJyYW5rIiwgIm5hbWUiLCAiY291bnQiKSAjc2VlbXMgbGlrZSB0aGlzIHNob3VsZCBiZSBlYXNpZXIuLi4KICBib3lfbmFtZXMgPC0gYm95X25hbWVzICU+JSAKICAgIGFzLmRhdGEuZnJhbWUoKSAlPiUgCiAgICBtdXRhdGUoaXNfZ2lybF9uYW1lID0gMCkgIyBhZGQgYSBnZW5kZXIgZHVtbXkKICAKICBnaXJsX25hbWVzIDwtIGFsbF9uYW1lc1tbM11dICMgdGhpcmQgdGFibGUgZnJvbSB0aGUgbGlzdCBvZiB0aHJlZQogIGNvbG5h
bWVzKGdpcmxfbmFtZXMpIDwtIGMoInJhbmsiLCAibmFtZSIsICJjb3VudCIpICNzZWVtcyBsaWtlIHRoaXMgc2hvdWxkIGJlIGVhc2llci4uLgogIGdpcmxfbmFtZXMgPC0gZ2lybF9uYW1lcyAlPiUgCiAgICBhcy5kYXRhLmZyYW1lKCkgJT4lIAogICAgbXV0YXRlKGlzX2dpcmxfbmFtZSA9IDEpICAjIGFkZCBhIGdlbmRlciBkdW1teQogIAogIGFsbF9uYW1lcyA8LSByYmluZChnaXJsX25hbWVzLCBib3lfbmFtZXMpICU+JSAjIGNvbWJpbmUgdGhlIHR3byBhcyBhIG5ldyBkYXRhIGZyYW1lCiAgICBtdXRhdGUoeWVhciA9IHllYXIpICAKICByZXR1cm4oYWxsX25hbWVzKQp9CmBgYAoKPGJyPgoKLS0tLQoKIyBDcmF3bGluZwoKVGhlIG5leHQgc3RlcCBpcyBhY3R1YWxseSBjcmF3bGluZyB0aGUgbmFtZXMgaW4gYSBzcGVjaWZpYyBzZXQgb2YgeWVhcnMgYW5kIGJpbmQgdGhhdCBpbiBhIGRhdGEgZnJhbWUuCgpgYGB7ciwgZXZhbD1GQUxTRX0KIyAtLS0tLS0tLS0tLSBJTVBMRU1FTlQgVEhFIFNDUkFQRVIgLS0tLS0tLS0tLS0tLS0tLSB8CgojIGNoZWNrIHBlcm1pc3Npb25zIGFuZCBpbnRyb2R1Y2UgbXlzZWxmIHRvIHRoZSBob3N0CnNlc3Npb24gPC0gYm93KCJodHRwczovL3d3dy5tZWVydGVucy5rbmF3Lm5sL252Yi8iLCB1c2VyX2FnZW50ID0gICJSLiBDb3J0ZW4sIFVuaXZlcnNpdGVpdCBVdHJlY2h0IiwgZGVsYXkgPSAxKQpzZXNzaW9uCgphbGxfbmFtZXMgPC0gZGF0YS5mcmFtZSggIyBpbml0aWFsaXplIHRoZSBlbXB0eSBkYXRhIGZyYW1lIGZvciB0aGUgcmVzdWx0cwogIHJhbmsgPSBpbnRlZ2VyKCksCiAgbmFtZSA9IGNoYXJhY3RlcigpLAogIGNvdW50ID0gaW50ZWdlcigpLAogIGlzX2dpcmxfbmFtZSA9IGludGVnZXIoKSwKICB5ZWFyID0gaW50ZWdlcigpCikKCnN0YXJ0eWVhciA9IDE5NTAKZW5keWVhciA9IDIwMTQKCmZvcihpIGluIGVuZHllYXI6c3RhcnR5ZWFyKXsgIyBsb29wIG92ZXIgYWxsIHllYXJzCiAgcHJpbnQocGFzdGUoInNjcmFwaW5nIHllYXIiLGkpKQogIG5hbWVzX3llYXIgPC0gZ2V0X3llYXJfbmFtZXMoc2Vzc2lvbiwgaSkKICBhbGxfbmFtZXMgPC0gcmJpbmQoYWxsX25hbWVzLCBuYW1lc195ZWFyKQp9CmBgYAoKPGJyPgoKLS0tLQoKIyBTYXZpbmcgdGhlIGRhdGEKClRoZSBmaW5hbCBzdGVwIGhlcmUgaXMgdG8gc2F2ZSBpdCB0byBhIENTViBmaWxlIHNvIHRoYXQgd2UgY2FuIGluc3BlY3QgbnVtYmVycyBvZiB0aGlzIHNwZWNpZmljIHNldCBvZiBuYW1lcyB0byBzZWxlY3Qgb3VyIFgncyBmcm9tLgoKYGBge3IsIGV2YWw9RkFMU0V9CndyaXRlLmNzdihhbGxfbmFtZXMsIAogICAgICAgICAgZmlsZSA9IHBhc3RlKCJhbGxfbmFtZXNfIixhcy5jaGFyYWN0ZXIoc3RhcnR5ZWFyKSxhcy5jaGFyYWN0ZXIoZW5keWVhciksIi5jc3YiLCBzZXA9IiIpLAogICAgICAgICAgcm93Lm5hbWVzPUZBTFNFKQoKYGBgCgoKCgoKCg==