Title: Extractive Summarization of Text with the LexRank Algorithm
Description: An R implementation of the LexRank algorithm described by G. Erkan and D. R. Radev (2004) <DOI:10.1613/jair.1523>.
Authors: Adam Spannbauer [aut, cre], Bryan White [ctb]
Maintainer: Adam Spannbauer <[email protected]>
License: MIT + file LICENSE
Version: 0.5.2
Built: 2024-11-01 04:24:22 UTC
Source: https://github.com/adamspannbauer/lexrankr
Bind lexrank scores to a dataframe of sentences or to a dataframe of tokens with sentence ids
bind_lexrank_(tbl, text, doc_id, sent_id = NULL, level = c("sentences", "tokens"), threshold = 0.2, usePageRank = TRUE, damping = 0.85, continuous = FALSE, ...)

bind_lexrank(tbl, text, doc_id, sent_id = NULL, level = c("sentences", "tokens"), threshold = 0.2, usePageRank = TRUE, damping = 0.85, continuous = FALSE, ...)
tbl: dataframe containing a column of sentences to be lexranked
text: name of the column containing sentences or tokens to be lexranked
doc_id: name of the column containing document ids corresponding to text
sent_id: name of the column containing sentence ids corresponding to text; only needed if level is "tokens"
level: the parsed level of the text column to be lexranked, i.e. whether text contains "sentences" or "tokens"; the "tokens" level is provided to allow custom tokenization (be sure to pass doc_id and sent_id when level is "tokens")
threshold: the minimum similarity value a sentence pair must have to be represented in the graph where lexRank is calculated
usePageRank: TRUE or FALSE indicating whether to use the page rank algorithm for ranking sentences; if FALSE, a sentence's unweighted centrality is used as its rank
damping: the damping factor passed to the page rank algorithm; ignored if usePageRank is FALSE
continuous: TRUE or FALSE indicating whether to use continuous LexRank; only applies if usePageRank is TRUE, in which case threshold is ignored and lexRank is computed over a weighted graph of the sentences
...: tokenizing options to be passed to lexRankr::tokenize; ignored if level is "sentences"
A dataframe with an additional column of lexrank scores (column is given name lexrank)
df <- data.frame(doc_id = 1:3,
                 text = c("Testing the system. Second sentence for you.",
                          "System testing the tidy documents df.",
                          "Documents will be parsed and lexranked."),
                 stringsAsFactors = FALSE)

## Not run:
library(magrittr)

df %>%
  unnest_sentences(sents, text) %>%
  bind_lexrank(sents, doc_id, level = "sentences")

df %>%
  unnest_sentences(sents, text) %>%
  bind_lexrank_("sents", "doc_id", level = "sentences")

df <- data.frame(doc_id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3),
                 sent_id = c(1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
                 tokens = c("testing", "the", "system", "second", "sentence", "for", "you",
                            "system", "testing", "the", "tidy", "documents", "df",
                            "documents", "will", "be", "parsed", "and", "lexranked"),
                 stringsAsFactors = FALSE)

df %>% bind_lexrank(tokens, doc_id, sent_id, level = "tokens")
## End(Not run)
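One way to use the resulting lexrank column, as a minimal base-R sketch (assuming lexrankr and magrittr are attached; the column names doc_id, sents, and lexrank come from the sentence-level example above): keep only the highest-scoring sentence per document.

df <- data.frame(doc_id = 1:3,
                 text = c("Testing the system. Second sentence for you.",
                          "System testing the tidy documents df.",
                          "Documents will be parsed and lexranked."),
                 stringsAsFactors = FALSE)

ranked <- df %>%
  unnest_sentences(sents, text) %>%
  bind_lexrank(sents, doc_id, level = "sentences")

# sort by document, then by descending lexrank score
top_rows <- ranked[order(ranked$doc_id, -ranked$lexrank), ]
# the first row per doc_id is that document's top-ranked sentence
top_rows[!duplicated(top_rows$doc_id), c("doc_id", "sents")]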
Compute LexRanks from a vector of documents using the page rank algorithm or degree centrality. The methods used to compute lexRank are discussed in "LexRank: Graph-based Lexical Centrality as Salience in Text Summarization."
lexRank(text, docId = "create", threshold = 0.2, n = 3, returnTies = TRUE, usePageRank = TRUE, damping = 0.85, continuous = FALSE, sentencesAsDocs = FALSE, removePunc = TRUE, removeNum = TRUE, toLower = TRUE, stemWords = TRUE, rmStopWords = TRUE, Verbose = TRUE)
text: a character vector of documents to be cleaned and processed by the LexRank algorithm
docId: a vector of document ids with length equal to the length of text; if docId == "create", document ids are created as an index from 1 to n, where n is the length of text
threshold: the minimum similarity value a sentence pair must have to be represented in the graph where lexRank is calculated
n: the number of sentences to return as the extractive summary; the function returns the top n lexRanked sentences (see returnTies for the handling of ties)
returnTies: TRUE or FALSE indicating whether to return more than n sentence ids in the case of a tie in lexRank; if TRUE, the result is not limited to n sentences; if FALSE, at most n sentences are returned
usePageRank: TRUE or FALSE indicating whether to use the page rank algorithm for ranking sentences; if FALSE, a sentence's unweighted centrality is used as its rank
damping: the damping factor passed to the page rank algorithm; ignored if usePageRank is FALSE
continuous: TRUE or FALSE indicating whether to use continuous LexRank; only applies if usePageRank is TRUE, in which case threshold is ignored and lexRank is computed over a weighted graph of the sentences
sentencesAsDocs: TRUE or FALSE indicating whether to treat sentences as documents when calculating tf-idf scores; if TRUE, inverse document frequency is calculated as inverse sentence frequency (useful for single-document extractive summarization)
removePunc: TRUE or FALSE indicating whether to remove punctuation from text while tokenizing
removeNum: TRUE or FALSE indicating whether to remove numbers from text while tokenizing
toLower: TRUE or FALSE indicating whether to coerce text to lowercase while tokenizing
stemWords: TRUE or FALSE indicating whether to stem the resulting tokens
rmStopWords: TRUE, FALSE, or a character vector of stopwords to remove; if TRUE, words in lexRankr::smart_stopwords are removed prior to stemming; if FALSE, no stopword removal occurs; if a character vector is passed, it is used as the stopword list
Verbose: TRUE or FALSE indicating whether to print progress messages to the console while running
A 2-column dataframe with columns sentenceId and value. sentenceId contains the ids of the top n sentences in descending order by value. value contains the page rank score (if usePageRank == TRUE) or degree centrality (if usePageRank == FALSE).
http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
lexRank(c("This is a test.","Tests are fun.", "Do you think the exam will be hard?","Is an exam the same as a test?", "How many questions are going to be on the exam?"))
lexRank(c("This is a test.","Tests are fun.", "Do you think the exam will be hard?","Is an exam the same as a test?", "How many questions are going to be on the exam?"))
Compute LexRanks from sentence pair similarities using the page rank algorithm or degree centrality. The methods used to compute lexRank are discussed in "LexRank: Graph-based Lexical Centrality as Salience in Text Summarization."
lexRankFromSimil(s1, s2, simil, threshold = 0.2, n = 3, returnTies = TRUE, usePageRank = TRUE, damping = 0.85, continuous = FALSE)
s1: a character vector of sentence ids corresponding to the s2 and simil arguments
s2: a character vector of sentence ids corresponding to the s1 and simil arguments
simil: a numeric vector of similarity values representing the similarity between the sentences identified by the ids in s1 and s2
threshold: the minimum simil value a sentence pair must have to be represented in the graph where lexRank is calculated
n: the number of sentences to return as the extractive summary; the function returns the top n lexRanked sentences (see returnTies for the handling of ties)
returnTies: TRUE or FALSE indicating whether to return more than n sentence ids in the case of a tie in lexRank; if TRUE, the result is not limited to n sentences; if FALSE, at most n sentences are returned
usePageRank: TRUE or FALSE indicating whether to use the page rank algorithm for ranking sentences; if FALSE, a sentence's unweighted centrality is used as its rank
damping: the damping factor passed to the page rank algorithm; ignored if usePageRank is FALSE
continuous: TRUE or FALSE indicating whether to use continuous LexRank; only applies if usePageRank is TRUE, in which case threshold is ignored and lexRank is computed over a weighted graph of the sentences
A 2-column dataframe with columns sentenceId and value. sentenceId contains the ids of the top n sentences in descending order by value. value contains the page rank score (if usePageRank == TRUE) or degree centrality (if usePageRank == FALSE).
http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
lexRankFromSimil(s1=c("d1_1","d1_1","d1_2"), s2=c("d1_2","d2_1","d2_1"), simil=c(.01,.03,.5))
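With the default threshold = 0.2, only the pair with similarity 0.5 is represented in the graph; a quick sketch (assuming lexrankr is attached) of lowering the threshold so all three pairs contribute edges:

s1 <- c("d1_1", "d1_1", "d1_2")
s2 <- c("d1_2", "d2_1", "d2_1")
simil <- c(.01, .03, .5)

# default threshold = 0.2: only the 0.5 pair forms an edge
lexRankFromSimil(s1, s2, simil)
# threshold = 0: all pairs are kept in the graph
lexRankFromSimil(s1, s2, simil, threshold = 0)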
Utility to parse sentences from text; created to have a central shared sentence parsing function
sentence_parser(text)
text: character vector to be parsed into sentences
A list with length equal to length(text); list elements are character vectors of text parsed with the sentence regex.
Parse the elements of a character vector into a dataframe of sentences with additional identifiers.
sentenceParse(text, docId = "create")
text: character vector to be parsed into sentences
docId: a vector of document ids with length equal to the length of text; if docId == "create", document ids are created as an index from 1 to n, where n is the length of text
A data frame with 3 columns and n rows, where n is the number of sentences found by the routine. Column 1: docId, the document id for the sentence. Column 2: sentenceId, the sentence id for the sentence. Column 3: sentence, the text of the sentence found by the routine.
sentenceParse("Bill is trying to earn a Ph.D.", "You have to have a 5.0 GPA.") sentenceParse(c("Bill is trying to earn a Ph.D.", "You have to have a 5.0 GPA."), docId=c("d1","d2"))
sentenceParse("Bill is trying to earn a Ph.D.", "You have to have a 5.0 GPA.") sentenceParse(c("Bill is trying to earn a Ph.D.", "You have to have a 5.0 GPA."), docId=c("d1","d2"))
Compute distance between sentences using the modified idf cosine distance from "LexRank: Graph-based Lexical Centrality as Salience in Text Summarization". Output can be used as input to lexRankFromSimil.
sentenceSimil(sentenceId, token, docId = NULL, sentencesAsDocs = FALSE)
sentenceId: a character vector of sentence ids corresponding to the docId and token arguments
token: a character vector of tokens corresponding to the docId and sentenceId arguments
docId: a character vector of document ids corresponding to the sentenceId and token arguments; can be NULL if sentencesAsDocs is TRUE
sentencesAsDocs: TRUE or FALSE indicating whether to treat sentences as documents when calculating tf-idf scores; if TRUE, inverse document frequency is calculated as inverse sentence frequency (useful for single-document extractive summarization)
A 3-column dataframe of pairwise distances between sentences. Columns: sent1 (sentence id), sent2 (sentence id), and dist (the distance between sent1 and sent2).
http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
sentenceSimil(docId=c("d1","d1","d2","d2"), sentenceId=c("d1_1","d1_1","d2_1","d2_1"), token=c("i", "ran", "jane", "ran"))
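Since the output is designed to feed lexRankFromSimil, a sketch of the full pipeline (assuming lexrankr is attached and that the output columns are named sent1, sent2, and dist as documented above; the threshold is lowered because the toy corpus yields small pairwise values):

docs <- c("Testing the system. Second sentence for you.",
          "System testing the tidy documents df.")
parsed <- sentenceTokenParse(docs, docId = c("d1", "d2"))
tok <- parsed$tokens

simil_df <- sentenceSimil(sentenceId = tok$sentenceId,
                          token = tok$token,
                          docId = tok$docId)

# rank sentences from the pairwise values
lexRankFromSimil(simil_df$sent1, simil_df$sent2, simil_df$dist, threshold = 0)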
Parse a character vector of documents into both sentences and a clean vector of tokens. The resulting output includes ids for document and sentence for use in other lexRank functions.
sentenceTokenParse(text, docId = "create", removePunc = TRUE, removeNum = TRUE, toLower = TRUE, stemWords = TRUE, rmStopWords = TRUE)
text: a character vector of documents to be parsed into sentences and tokenized
docId: a character vector of document ids the same length as text; if docId == "create", document ids are created
removePunc: TRUE or FALSE indicating whether to remove punctuation from text while tokenizing
removeNum: TRUE or FALSE indicating whether to remove numbers from text while tokenizing
toLower: TRUE or FALSE indicating whether to coerce text to lowercase while tokenizing
stemWords: TRUE or FALSE indicating whether to stem the resulting tokens
rmStopWords: TRUE, FALSE, or a character vector of stopwords to remove; if TRUE, words in lexRankr::smart_stopwords are removed prior to stemming; if FALSE, no stopword removal occurs; if a character vector is passed, it is used as the stopword list
A list of two dataframes. The first element is the sentences dataframe, with columns docId, sentenceId, and sentence (the text of the sentence). The second element is the tokens dataframe, with columns docId, sentenceId, and token (the text of the token).
sentenceTokenParse(c("Bill is trying to earn a Ph.D.", "You have to have a 5.0 GPA."), docId=c("d1","d2"))
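The two dataframes described above can be pulled out of the returned list by name (assuming lexrankr is attached):

parsed <- sentenceTokenParse(c("Bill is trying to earn a Ph.D.",
                               "You have to have a 5.0 GPA."),
                             docId = c("d1", "d2"))
parsed$sentences  # docId, sentenceId, sentence
parsed$tokens     # docId, sentenceId, token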
English stopwords from the SMART information retrieval system (as documented in Appendix 11 of http://jmlr.csail.mit.edu/papers/volume5/lewis04a/)
smart_stopwords
a character vector with 571 elements
http://jmlr.csail.mit.edu/papers/volume5/lewis04a/
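A quick look at the bundled vector, and a sketch of passing a custom stopword list through rmStopWords instead (which, per the tokenize arguments below, also accepts a character vector):

head(smart_stopwords)
length(smart_stopwords)  # 571

# use a small custom stopword list rather than the SMART list
tokenize("This is a test of custom stopwords.",
         rmStopWords = c("this", "is", "a", "of"))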
Tokenize a character vector. Parse the elements of a character vector into a list of cleaned tokens.
tokenize(text, removePunc = TRUE, removeNum = TRUE, toLower = TRUE, stemWords = TRUE, rmStopWords = TRUE)
text: the character vector to be tokenized
removePunc: TRUE or FALSE indicating whether to remove punctuation from text while tokenizing
removeNum: TRUE or FALSE indicating whether to remove numbers from text while tokenizing
toLower: TRUE or FALSE indicating whether to coerce text to lowercase while tokenizing
stemWords: TRUE or FALSE indicating whether to stem the resulting tokens
rmStopWords: TRUE, FALSE, or a character vector of stopwords to remove; if TRUE, words in lexRankr::smart_stopwords are removed prior to stemming; if FALSE, no stopword removal occurs; if a character vector is passed, it is used as the stopword list
tokenize("Mr. Feeny said the test would be on Sat. At least I'm 99.9% sure that's what he said.") tokenize("Bill is trying to earn a Ph.D. in his field.", rmStopWords=FALSE)
tokenize("Mr. Feeny said the test would be on Sat. At least I'm 99.9% sure that's what he said.") tokenize("Bill is trying to earn a Ph.D. in his field.", rmStopWords=FALSE)
Split a column of text into sentences
unnest_sentences_(tbl, output, input, doc_id = NULL, output_id = "sent_id", drop = TRUE)

unnest_sentences(tbl, output, input, doc_id = NULL, output_id = "sent_id", drop = TRUE)
tbl: dataframe containing a column of text to be split into sentences
output: name of the column to be created to store the parsed sentences
input: name of the input column of text to be parsed into sentences
doc_id: column of document ids; if not provided, each row is assumed to be a different document
output_id: name of the column to be created to store sentence ids
drop: whether the original input column should be dropped
A data.frame of parsed sentences and sentence ids
df <- data.frame(doc_id = 1:3,
                 text = c("Testing the system. Second sentence for you.",
                          "System testing the tidy documents df.",
                          "Documents will be parsed and lexranked."),
                 stringsAsFactors = FALSE)

unnest_sentences(df, sents, text)
unnest_sentences_(df, "sents", "text")

## Not run:
library(magrittr)
df %>% unnest_sentences(sents, text)
## End(Not run)
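A sketch of the remaining arguments on the standard-evaluation version (custom sentence-id column name, keeping the input column):

df <- data.frame(doc_id = 1:2,
                 text = c("Testing the system. Second sentence for you.",
                          "Documents will be parsed and lexranked."),
                 stringsAsFactors = FALSE)

# name the sentence-id column "sid" and keep the original text column
unnest_sentences_(df, "sents", "text", output_id = "sid", drop = FALSE)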