/* * Copyright ©2021 Travis McGaha. All rights reserved. Permission is * hereby granted to students registered for University of Washington * CSE 333 for use solely during Spring Quarter 2021 for purposes of * the course. No other use, copying, distribution, or modification * is permitted without prior written consent. Copyrights for * third-party components of this work must be honored. Instructors * interested in reusing these course materials should contact the * author. */ #ifndef HW2_MEMINDEX_H_ #define HW2_MEMINDEX_H_ #include #include "libhw1/HashTable.h" #include "libhw1/LinkedList.h" #include "./DocTable.h" // A word's byte offset from the start of a document. typedef uint32_t DocPositionOffset_t; // A MemIndex is an in-memory inverted index. // // An inverted index is a unidirectional mapping of word to a logical set // of (document, document offset) tuples, indicating all the occurances // of that word within the document corpus. // // Although C is a procedural language, HW1 demonstrated that we could // implement some aspects of object-oriented programming; namely, data // encapsulation (ie, a struct), access restrictions (ie, hiding the struct // definition in the .c), and information hiding (ie, hiding functionality // in the .c). MemIndex is a primitive example of how to do another OOP // feature: inheritance. Specifically, it is a simple example of // implementation inheritance (ie, we reuse methods but add no new fields). // More features, such as dynamic dispatch (also known as "late binding") // and adding new fields would require the creation of a bookkeeping struct // that contained function pointers (to enable dynamic dispatch) and field // offsets (to enable adding fields) -- essentially, a hand-implemented // vtable. See also DocTable for an example of class composition. typedef HashTable MemIndex; // Allocate and return a new MemIndex. The caller takes responsibility for // eventually calling MemIndex_Free to free memory associated with the table. // // Arguments: none. // // Returns: // - the newly-allocated table (never NULL). MemIndex* MemIndex_Allocate(void); // Frees a MemIndex that was previously allocated by MemIndex_Allocate, // including all the structures stored within it. // // Arguments: // - index: a previously-allocated MemIndex. void MemIndex_Free(MemIndex *index); // Returns the number of words contained within the index. // // Arguments: // - index: the MemIndex // // Returns: // - the number of unique words in the index. int MemIndex_NumWords(MemIndex *index); // Adds a "posting list" to the MemIndex. // // A "posting list" is the list of positions within a specific document where // a particular word appears. The postings (ie, positions) are specified as // byte offsets from the beginning of the document (ie, DocPositionOffset_t), // and the passed-in list must be in ascending order. The passed-in word must // not have been previously added to this document. // // Arguments: // - index: the MemIndex to add these postings to // - word: the word that these postings refer to. MemIndex takes ownership // of this argument. // - docid: the document containing these postings // - postings: a non-empty list of byte offsets, in ascending order. // MemIndex takes ownership of this list. void MemIndex_AddPostingList(MemIndex *index, char *word, DocID_t doc_id, LinkedList *postings); // A document that matches a search query. typedef struct { DocID_t doc_id; // a document that matches a search query int rank; // an indicator of the quality of the match } SearchResult; // Processes a query against the MemIndex, returning a list SearchResults. // // The results are a list of SearchResult's, containing all the documents // containing every query word at least once. The results will be returned // in decreasing quality (i.e. sorted by the rank field in decreasing order). // The client is responsible for freeing the returned list of SearchResults; // the appropriate deallocator to pass into LinkedList_Free is stdlib's free(). // // Arguments: // - index: the MemIndex to query // - query: a non-empty array of strings; each string is a single // null-terminated query word, all lower-case. // - querylen: the number of words in the query array. // // Returns: // - NULL: if no matching documents were found // - a non-NULL LinkedList of SearchResult's LinkedList* MemIndex_Search(MemIndex *index, char *query[], int query_len); ////////////////////////////////////////////////////////////////////////////// // The struct stored within our hash table as the HTValue_t. // // For HW3, clients will need to break the MemIndex abstraction by directly // accessing the contents of the MemIndex HashTable. This struct is how // we implement the "logical set of (document, document offset) tuples" // which is discussed at the top of the file; the 'word' field is the logical // key to the inverted index and the 'postings' is its associated value, // represented as a mapping from DocID_t -> LinkedList(DocPositionOffset_t). // // The WordPostings struct owns the memory allocated to both the word and // postings fields. typedef struct { char *word; HashTable *postings; } WordPostings; #endif // HW2_MEMINDEX_H_