EPINetz-Policy-Parser/classify_documents.R at master · TimBMK/EPINetz-Policy-Parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
# Classify tokenized documents into groups (e.g. policy fields) by summing,
# per document and group, the scores of the walk terms found in the document.
#
# walk_terms must contain the columns NodeNames, the group variable
# (group_name) and the chosen classification_measure. A seed_term column is
# required when a classification_cutoff is combined with keep_seed_terms, when
# seedterm_value is set, and when return_walk_terms is TRUE.
#
# Returns the classified documents: one row per doc_id x group with a `score`
# column and, if normalize_scores is set, a `score_norm` column. If
# return_walk_terms or return_unclassified_docs is TRUE, a list is returned
# instead, holding classified_documents plus walk_terms and/or
# unclassified_documents.
classify_documents <- function( # the working horse function to classify documents with the walk terms calculated in previous steps
    walk_terms, # data with terms generated by get_rwr_terms()
    group_name = "policy_field", # name of the group variable which specifies the classes the documents will be sorted into
    document_tokens, # data with the documents to be classified. Expects tokenized data, with a doc_id and one token per row
    tokens_var, # name of the tokens variable within the documents dataframe. Usually "tokens", "lemma", etc.
    doc_id, # name of the doc_id variable in document_tokens
    classification_measure = c("Score", "ScoreMean", # the measure to use for classification. Must be present in walk_terms. Defaults to the first option
                               "ScoreNorm", "ScoreNormMean",
                               "ScoreNormGroup", "ScoreNormGroupMean"),
    classification_cutoff = NULL, # optional cutoff to filter the walk_terms; applies to the classification measure. NULL to skip. Useful if a stricter cutoff is desired than in get_rwr_terms()
    keep_seed_terms = TRUE, # should seed terms be kept even if their score is lower than the cutoff? only applies if a classification_cutoff is specified
    seedterm_value = NULL, # fixed numerical value to overwrite the classification measure of seed terms with. NULL to skip
    normalize_scores = c("doc", "group", NULL), # should the document scores be rescaled to 0-1? "doc" (within each document, the default), "group" (within each group), or NULL to skip
    cutoff_value = NULL, # a numerical value. Scores below it will be set to 0. NULL to skip
    cutoff_quantile = FALSE, # if TRUE, the cutoff_value specifies a quantile, rather than a fixed value
    cutoff_normalized_scores = FALSE, # if TRUE, the cutoff is applied to the normalized scores. Otherwise, normalization is applied after the cutoff
    minimum_results = NULL, # numerical minimum number of results for each group to be returned. Bypasses the cutoff_value as needed. NULL to skip
    cut_frequent_group_terms = c(NULL, numeric(), "auto"),  # Should terms appearing in numerous groups be cut? (the default evaluates to "auto")
                                      #  "auto" to cut terms appearing in more than 50% of the groups
                                      #  numeric value to cut terms appearing in more groups than this number
                                      #  NULL to skip
    return_walk_terms = TRUE, # should the processed walk terms be returned for further analysis and transparency?
    return_unclassified_docs = TRUE, # should the IDs of the unclassified docs be returned?
              # setting return_walk_terms or return_unclassified_docs to TRUE returns a list of dataframes rather than a single dataframe
    verbose = TRUE # should progress information and the number of unclassified documents be reported?
    ){

  ## Data and Input Checks

  # arg_match() only *returns* the validated choice; without the assignment the
  # default (multi-element) vectors would reach the scalar if() checks below
  classification_measure <- rlang::arg_match(classification_measure)

  # NULL means "skip normalization" and must bypass arg_match(), which only
  # accepts character input
  if (!is.null(normalize_scores)) {
    normalize_scores <- rlang::arg_match(normalize_scores)
  }

  if (!is.null(cut_frequent_group_terms)) {
    if (is.na(cut_frequent_group_terms)) {
      stop("cut_frequent_group_terms must be NULL, 'auto', or a numeric value, not NA.\n")
    }
    if (!(is.numeric(cut_frequent_group_terms) ||
          cut_frequent_group_terms == "auto")) {
      stop(
        paste0(
          "cut_frequent_group_terms must be NULL, 'auto', or a numeric value, not '",
          cut_frequent_group_terms,
          "'.\n"
        )
      )
    }
  }

  if (!(classification_measure %in% names(walk_terms))) {
    stop(paste("classification_measure", classification_measure,
               "not present in walk_terms data\n"))
  }

  if (!(group_name %in% names(walk_terms))) {
    stop(paste("group_name", group_name, "not present in walk_terms data\n"))
  }

  if (!(doc_id %in% names(document_tokens))) {
    stop(paste("doc_id", doc_id, "not present in document_tokens data\n"))
  }

  if (!(tokens_var %in% names(document_tokens))) {
    stop(paste("tokens_var", tokens_var, "not present in document_tokens data\n"))
  }

  # the cutoff on normalized scores needs the score_norm column, which only
  # exists when normalization is requested
  if (cutoff_normalized_scores && !is.null(cutoff_value) &&
      is.null(normalize_scores)) {
    stop("cutoff_normalized_scores = TRUE requires normalize_scores to be 'doc' or 'group'.\n")
  }

  # symbols for the user-supplied column names, injected with !! below
  measure_sym <- as.name(classification_measure)
  group_sym <- as.name(group_name)
  tokens_sym <- as.name(tokens_var)
  doc_sym <- as.name(doc_id)

  uses_mean_measure <- stringr::str_detect(classification_measure, "Mean")

  ## Data Prep

  ### apply cutoff to the walk terms; seed terms survive only if keep_seed_terms is set
  if (!is.null(classification_cutoff)) {
    if (keep_seed_terms) {
      walk_terms <- walk_terms |>
        dplyr::filter(!!measure_sym >= classification_cutoff |
                        seed_term == TRUE)
    } else {
      walk_terms <- walk_terms |>
        dplyr::filter(!!measure_sym >= classification_cutoff)
    }
  }

  ### overwrite seedterm values (if desired)
  if (!is.null(seedterm_value)) {
    walk_terms <- walk_terms |>
      dplyr::mutate(
        !!measure_sym := dplyr::case_when(
          seed_term == TRUE ~ seedterm_value,
          .default = !!measure_sym
        )
      )
  }

  ### one score per term and group (rather than one for each seedterm-term connection)
  #### Note that means calculated here are calculated AFTER the filtering above, so they are based on the remaining values only
  if (uses_mean_measure) {
    # a mean measure is used as provided
    term_scores <- walk_terms |>
      dplyr::distinct(NodeNames, !!measure_sym, !!group_sym)
  } else {
    # otherwise average over duplicates stemming from different seed terms
    term_scores <- walk_terms |>
      dplyr::summarise(
        !!measure_sym := mean(!!measure_sym),
        .by = c(NodeNames, !!group_sym)
      )
  }

  classification_terms <- document_tokens |>
    dplyr::semi_join(walk_terms, # keep only tokens present in the walk terms
                     dplyr::join_by(!!tokens_sym == NodeNames)) |>
    dplyr::left_join( # add classification attributes
      term_scores,
      dplyr::join_by(!!tokens_sym == NodeNames),
      relationship = "many-to-many" # terms occur in multiple docs and in multiple groups
    )

  ## cut frequent group terms
  if (!is.null(cut_frequent_group_terms)) {
    if (cut_frequent_group_terms == "auto") {
      # "auto": a term may appear in at most half of the groups found in the matched terms
      max_groups <- classification_terms |>
        dplyr::distinct(!!group_sym) |>
        nrow() / 2
    } else {
      max_groups <- cut_frequent_group_terms
    }

    groups_per_term <- classification_terms |>
      dplyr::distinct(!!tokens_sym, !!group_sym) |>
      dplyr::count(!!tokens_sym)

    kept_terms <-
      groups_per_term[[tokens_var]][groups_per_term[["n"]] <= max_groups]

    classification_terms <- classification_terms |>
      dplyr::filter(!!tokens_sym %in% kept_terms)
  }


  ## classify

  classified_documents <- classification_terms |>
    dplyr::summarize(score = sum(!!measure_sym),
                     .by = c(!!doc_sym, !!group_sym)) |> # sum scores by group and document
    tidyr::complete(!!doc_sym, !!group_sym,
                    fill = list(score = 0)) # every doc x group combination, 0 where nothing matched


  ### apply cutoff before normalization
  if (!cutoff_normalized_scores && !is.null(cutoff_value)) {

    if (!is.null(minimum_results)) { # preserve original values for the minimum results
      top_values <- classified_documents |>
        dplyr::slice_max(score, n = minimum_results,
                         by = !!group_sym) |>
        dplyr::rename(top_value = score) |>
        dplyr::select(dplyr::all_of(c(doc_id, group_name)), top_value)
    }

    if (cutoff_quantile) {

      threshold <- stats::quantile(classified_documents$score,
                                   cutoff_value)[[1]]
      classified_documents <- classified_documents |>
        dplyr::mutate(score = dplyr::case_when(score < threshold ~ 0,
                                               .default = score))

      if (verbose) {
        cat(paste0("Setting score-values below ",
                   threshold, " (", cutoff_value,
                   " quantile) to 0. This step is applied before normalization.\n"))
      }
    } else {
      classified_documents <- classified_documents |>
        dplyr::mutate(score = dplyr::case_when(score < cutoff_value ~ 0,
                                               .default = score))
      if (verbose) {
        cat(paste("Setting score-values below",
                  cutoff_value,
                  "to 0. This step is applied before normalization.\n"))
      }
    }

    if (!is.null(minimum_results)) {
      classified_documents <- classified_documents |>
        dplyr::left_join(top_values, by = c(doc_id, group_name)) |>
        dplyr::mutate(top_value = dplyr::coalesce(top_value, 0)) |> # rows outside the top values restore to 0
        dplyr::mutate(score = dplyr::case_when( # restore the top values in groups left with fewer than the minimum of scores > 0
          sum(score > 0) < minimum_results ~ top_value,
          .default = score), .by = !!group_sym) |>
        dplyr::select(!top_value)

      if (verbose) {
        cat(paste("A minimum of", minimum_results, "results per", group_name,
                  "is returned. This may overwrite the cutoff_value.\n"))
      }
    }

  }

  ### normalize scores between 0 and 1, within each document or within each group
  if (!is.null(normalize_scores)) {
    norm_by <- if (normalize_scores == "doc") doc_sym else group_sym

    classified_documents <- classified_documents |>
      dplyr::mutate(
        score_norm = scales::rescale(score, to = c(0, 1)),
        .by = !!norm_by
      ) |>
      # set the normalized score to 0 when the score is 0. This prevents documents with 0 in all groups to show as 0.5 in the score_norm
      dplyr::mutate(score_norm = dplyr::case_when(score == 0 ~ 0,
                                                  .default = score_norm))
  }

  ### apply cutoff after normalization
  if (cutoff_normalized_scores && !is.null(cutoff_value)) {

    if (!is.null(minimum_results)) { # preserve original values for the minimum results
      top_values <- classified_documents |>
        dplyr::slice_max(score_norm, n = minimum_results,
                         by = !!group_sym) |>
        dplyr::rename(top_value = score_norm) |>
        dplyr::select(dplyr::all_of(c(doc_id, group_name)), top_value)
    }

    if (cutoff_quantile) {

      threshold <- stats::quantile(classified_documents$score_norm,
                                   cutoff_value)[[1]]
      classified_documents <- classified_documents |>
        dplyr::mutate(score_norm = dplyr::case_when(score_norm < threshold ~ 0,
                                                    .default = score_norm))

      if (verbose) {
        cat(paste0("Setting normalized score-values below ",
                   threshold, " (", cutoff_value,
                   " quantile) to 0. Non-normalized scores are set to 0 in accordance.\n"))
      }
    } else {
      classified_documents <- classified_documents |>
        dplyr::mutate(score_norm = dplyr::case_when(score_norm < cutoff_value ~ 0,
                                                    .default = score_norm))
      if (verbose) {
        cat(paste("Setting normalized score-values below",
                  cutoff_value,
                  "to 0. Non-normalized scores are set to 0 in accordance.\n"))
      }
    }

    if (!is.null(minimum_results)) {
      classified_documents <- classified_documents |>
        dplyr::left_join(top_values, by = c(doc_id, group_name)) |>
        dplyr::mutate(top_value = dplyr::coalesce(top_value, 0)) |> # rows outside the top values restore to 0
        dplyr::mutate(score_norm = dplyr::case_when( # restore the top values in groups left with fewer than the minimum of scores > 0
          sum(score_norm > 0) < minimum_results ~ top_value,
          .default = score_norm), .by = !!group_sym) |>
        dplyr::select(!top_value)

      if (verbose) {
        cat(paste("A minimum of", minimum_results, "results per", group_name,
                  "is returned. This may overwrite the cutoff_value.\n"))
      }
    }

    ### set non-normalized score to 0 where score_norm is 0
    #### done after the minimum_results step, so restored results keep their raw score
    classified_documents <- classified_documents |>
      dplyr::mutate(score = dplyr::case_when(score_norm == 0 ~ 0,
                                             .default = score))

  }

  ## report unclassified documents (documents without any row in the classification result)
  if (verbose || return_unclassified_docs) {
    unclassified_documents <- document_tokens |>
      dplyr::anti_join(classified_documents, by = doc_id) |>
      dplyr::distinct(!!doc_sym)

    if (verbose) {
      unclassified <- nrow(unclassified_documents)
      total <- document_tokens |> dplyr::distinct(!!doc_sym) |> nrow()
      cat(paste(unclassified,
                "out of",
                total,
                "documents could not be classified",
                paste0("(", scales::percent(unclassified/total,
                                            accuracy = 0.01), ")"),
                "\n"))
    }
  }

  ## return the result
  if (!(return_walk_terms || return_unclassified_docs)) {
    return(classified_documents)
  }

  out <- list()

  out$classified_documents <- classified_documents

  if (return_walk_terms) { # return the processed and formatted walk_terms
    classification_terms_out <- classification_terms |>
      dplyr::distinct(!!tokens_sym, !!group_sym, !!measure_sym)

    if (uses_mean_measure) {
      seed_info <- walk_terms |> # seed_term indicator only
        dplyr::distinct(!!group_sym, NodeNames, seed_term)
    } else { # if means were calculated during the process, we also report the unprocessed values
      seed_info <- walk_terms |>
        dplyr::distinct(!!group_sym, NodeNames, seed_term, !!measure_sym) |>
        dplyr::rename_with(\(col) paste0(col, "_unprocessed"),
                           dplyr::all_of(classification_measure))
    }

    out$walk_terms <- classification_terms_out |>
      dplyr::left_join(seed_info,
                       by = dplyr::join_by(!!group_sym,
                                           !!tokens_sym == NodeNames)) |>
      dplyr::arrange(!!group_sym, dplyr::desc(!!measure_sym)) |>
      dplyr::select(!!tokens_sym,
                    !!group_sym,
                    seed_term,
                    !!measure_sym,
                    dplyr::everything())
  }

  if (return_unclassified_docs) {
    out$unclassified_documents <- unclassified_documents
  }

  out

}


# Show the highest-scoring walk terms of each group to check classification
# results. In "print" mode the top terms are printed group by group and NULL is
# returned invisibly; in "return" mode a dataframe with the top terms of all
# groups is returned.
top_group_terms <- function( # a function to print the top terms for each group to check results
    classification_result, # the result of classify_documents(). requires the walk_terms data
    group_name, # the name of the group variable, e.g. "policy_field"
    classification_measure, # the classification measure to be used for selecting the top terms
    print_seed_terms = TRUE, # should seed terms be included? If FALSE, rows with seed_term == TRUE are dropped
    n = 20, # the number of terms to print
    with_ties = TRUE, # tie handling (see dplyr::slice_max)
    mode = c("print", "return") # should the results be printed or a dataframe of results be returned? Defaults to "print"
){

  # some checks
  # arg_match() returns the validated choice; assigning it collapses the
  # default vector to "print" so the scalar if() below works
  mode <- rlang::arg_match(mode)

  if (!("walk_terms" %in% names(classification_result))) {
    stop("Requires walk terms within the classifcation_result. Specify return_walk_terms = TRUE in classify_documents() to obtain them.")
  }

  walk_terms <- classification_result[["walk_terms"]]

  if (!(group_name %in% names(walk_terms))) {
    stop("Group name not found in the walk_terms data provided.")
  }

  if (!(classification_measure %in% names(walk_terms))) {
    stop("Classification Measure not found in the walk_terms data provided.")
  }

  # Filter out Seed Terms if desired
  if (!print_seed_terms) {
    if ("seed_term" %in% names(walk_terms)) {
      walk_terms <- walk_terms |>
        dplyr::filter(seed_term == FALSE)
    } else {
      # without a seed term indicator the filtering step is skipped rather than throwing an error
      warning("No seed_term indicator found the data provided.")
    }
  }

  measure_sym <- as.name(classification_measure)

  # Return data
  if (mode == "return") {
    return(
      walk_terms |>
        dplyr::slice_max(order_by = !!measure_sym,
                         n = n, with_ties = with_ties,
                         by = !!as.name(group_name))
    )
  }

  # Printout
  cat(paste("\nTop", n, "Terms per", group_name, "\n"))

  walk_terms |>
    data.table::as.data.table() |>
    split(by = group_name) |> # one table per group, named by the group value
    purrr::iwalk(\(group, name) {
      cat(paste0("\n", group_name, " ", name, ":\n"))
      group |>
        dplyr::slice_max(order_by = !!measure_sym,
                         n = n, with_ties = with_ties) |>
        dplyr::select(!dplyr::any_of(group_name)) |> # drop group name from printout
        print()
    })

  invisible(NULL)
}


# Show the highest-scoring documents of each group to check classification
# results. The classified documents are matched to the full document data via
# doc_id. In "print" mode the top documents are printed group by group and NULL
# is returned invisibly; in "return" mode a dataframe is returned with the
# group and score columns first.
top_group_documents <- function( # a function to print the top documents for each group to check results
  classification_result, # the result of classify_documents(): either the list output or the classified documents dataframe itself
  documents, # the full document data to be matched to the classification result for printout
  doc_id, # the name of the doc_id used for matching. Can be a join_by() function where classification_result = a and documents = b
  group_name, # the name of the group variable, e.g. "policy_field"
  classification_score, # the classification score to be used for selecting the top documents
  n = 20, # the number of documents to print per group
  with_ties = TRUE, # tie handling (see dplyr::slice_max)
  mode = c("print", "return") # should the results be printed or a dataframe of results be returned? Defaults to "print"
){

  # some checks

  # arg_match() returns the validated choice; assigning it collapses the
  # default vector to "print" so the scalar if() below works
  mode <- rlang::arg_match(mode)

  if ("classified_documents" %in% names(classification_result)) {
    classified_documents <- classification_result[["classified_documents"]]
  } else { # if only the classified documents are provided (e.g. because no additional data was generated by classify_documents()), the function can handle this data
    classified_documents <- classification_result
  }

  # the name checks only apply to character doc_ids; join_by() specifications are passed on as they are
  if (is.character(doc_id) && !all(doc_id %in% names(classified_documents))) {
    stop(paste(doc_id, "not found in the classification_results provided."))
  }

  if (is.character(doc_id) && !all(doc_id %in% names(documents))) {
    stop(paste(doc_id, "not found in the documents data provided."))
  }

  if (!(group_name %in% names(classified_documents))) {
    stop("Group name not found in the classified documents provided.")
  }

  if (!(classification_score %in% names(classified_documents))) {
    stop("Classification score not found in the classified documents provided.")
  }

  score_sym <- as.name(classification_score)

  # Return results
  if (mode == "return") {
    return(
      classified_documents |>
        dplyr::slice_max(order_by = !!score_sym,
                         n = n, with_ties = with_ties,
                         by = !!as.name(group_name)) |>
        dplyr::left_join(documents, by = doc_id) |>
        dplyr::relocate(dplyr::all_of(c(group_name, # group and score first
                                        classification_score)))
    )
  }

  # Printout
  cat(paste("\nTop", n, "Documents per", group_name, "\n"))

  classified_documents |>
    data.table::as.data.table() |>
    split(by = group_name) |> # one table per group, named by the group value
    purrr::iwalk(\(group, name) {
      cat(paste0("\n", group_name, " ", name, ":\n"))
      group |>
        dplyr::slice_max(order_by = !!score_sym,
                         n = n, with_ties = with_ties) |>
        dplyr::left_join(documents, by = doc_id) |>
        dplyr::select(!dplyr::any_of(group_name)) |> # drop group name from printout
        dplyr::relocate(dplyr::all_of(classification_score)) |> # score first
        print()
      cat("\n===========================================\n")
    })

  invisible(NULL)

}


# Print or return a random sample of the documents that could not be
# classified, matched to the full document data via doc_id. Returns NULL
# (invisibly) in "print" mode and when there are no unclassified documents.
get_unclassified_documents <- function( # a function to return a random sample of documents not classified
  classification_result, # the result of classify_documents(). requires the unclassified_documents data
  documents, # the full document data to be matched to the classification result for printout
  doc_id, # the name of the doc_id used for matching. Can be a join_by() function where classification_result = a and documents = b
  n = 20, # the number of documents to print or return
  mode = c("print", "return") # should the results be printed or a dataframe of results be returned? Defaults to "print"
    ) {

  # some checks

  # arg_match() returns the validated choice; assigning it collapses the
  # default vector to "print" so the scalar if() below works
  mode <- rlang::arg_match(mode)

  if (!("unclassified_documents" %in% names(classification_result))) {
    stop("Requires unclassified documents within the classifcation_result. Specify return_unclassified_docs = TRUE in classify_documents() to obtain them.")
  }

  unclassified_documents <- classification_result[["unclassified_documents"]]

  # the name checks only apply to character doc_ids; join_by() specifications are passed on as they are
  if (is.character(doc_id) && !all(doc_id %in% names(unclassified_documents))) {
    stop(paste(doc_id, "not found in the classification_results provided."))
  }

  if (is.character(doc_id) && !all(doc_id %in% names(documents))) {
    stop(paste(doc_id, "not found in the documents data provided."))
  }

  n_unclassified <- nrow(unclassified_documents)

  if (n_unclassified == 0) { # nothing to show if every document was classified
    return(invisible(NULL))
  }

  if (mode == "return") {
    return(
      unclassified_documents |>
        dplyr::slice_sample(n = n) |>
        dplyr::left_join(documents, by = doc_id)
    )
  }

  if (n < n_unclassified) {
    # print a sample...
    cat(paste("\nRandom Sample of", n, "out of",
              n_unclassified,
              "unclassified Documents:\n"))
    unclassified_documents |>
      dplyr::slice_sample(n = n) |>
      dplyr::left_join(documents, by = doc_id) |>
      print()
  } else {
    # ... or print all unclassified documents if n >= nr of unclassified docs
    cat(paste("\nReturning all",
              n_unclassified,
              "unclassified Documents:\n"))
    unclassified_documents |>
      dplyr::left_join(documents, by = doc_id) |>
      print()
  }

  invisible(NULL)
}