-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathstrtk_wordfreq.cpp
137 lines (111 loc) · 4.08 KB
/
strtk_wordfreq.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/*
*****************************************************************
* String Toolkit Library *
* *
* Word Frequency Calculator *
* Author: Arash Partow (2002-2020) *
* URL: http://www.partow.net/programming/strtk/index.html *
* *
* Copyright notice: *
* Free use of the String Toolkit Library is permitted under the *
* guidelines and in accordance with the most current version of *
* the MIT License. *
* http://www.opensource.org/licenses/MIT *
* *
*****************************************************************
*/
/*
Description: This example demonstrates how one can calculate the
word frequency model for a given piece of text using
the String Toolkit library. Input is taken either
from stdin or a user specified file. Once the text
has been fully processed, the frequency of each word
is then printed to stdout.
*/
#include <cstdio>
#include <iostream>
#include <iterator>
#include <utility>
#include <string>
#include <map>
#include "strtk.hpp"
/*
   Note: map_t associates each (lowercased) word with the number of
         times it has been seen. For performance reasons, if TR1 or
         C++11 is available, std::map in the definition below may be
         replaced with an unordered map (std::tr1::unordered_map or
         std::unordered_map). Be aware that the words would then no
         longer be printed in sorted order, since main() simply
         iterates over the container from begin() to end().
*/
typedef std::map<std::string,unsigned int> map_t;
/*
   Per-line word tallier.

   The object plays two roles:
     1. A line handler - operator()(const std::string&) splits a line
        of text using the delimiter predicate it was constructed with.
     2. The destination handed to strtk::split - each token range that
        is assigned to it is lowercased and counted.

   It holds references to the caller's running word total, the caller's
   word->count map and the delimiter predicate, so copies of a
   line_parser all update the same totals.
*/
template<typename Predicate>
struct line_parser
{
public:

   // word_count : running total of all words seen (updated in place)
   // map        : per-word occurrence counts      (updated in place)
   // p          : delimiter predicate passed on to strtk::split
   line_parser(unsigned long long& word_count,
               map_t& map,
               Predicate& p)
   : word_count_(word_count),
     map_(map),
     p_(p)
   {
      // Give the scratch buffer some initial capacity for token copies.
      str_.reserve(32);
   }

   // Line handler: empty lines are ignored, anything else is split with
   // *this acting as the receiver of the resulting tokens.
   void operator() (const std::string& s)
   {
      if (!s.empty())
      {
         strtk::split(p_, s, *this, strtk::split_options::compress_delimiters);
      }
   }

   // Token handler: r is a [first,second) character range. Empty ranges
   // are skipped; otherwise the total is bumped, the token is copied into
   // the scratch buffer, lowercased, and its map entry incremented
   // (operator[] creates the entry on first sight of a word).
   void operator=(const strtk::std_string::iterator_type& r)
   {
      if (r.first != r.second)
      {
         ++word_count_;
         str_.assign(r.first, r.second);
         strtk::convert_to_lowercase(str_);
         map_[str_] += 1;
      }
   }

   // Increment and dereference are no-ops that yield the object itself,
   // so that it can be written to with output-iterator syntax
   // (presumably the way strtk::split emits tokens).
   line_parser& operator++()    { return *this; }
   line_parser& operator++(int) { return *this; }
   line_parser& operator*()     { return *this; }

private:

   // Copy-assignment is declared but intentionally left undefined.
   line_parser& operator=(const line_parser&);

   unsigned long long& word_count_;
   map_t&              map_;
   Predicate&          p_;
   std::string         str_;   // reusable buffer holding the current token
};
/*
   Program entry point.

   Usage:
     strtk_wordfreq <file name>     - read text from the named file
     cat words.txt | strtk_wordfreq - read text from stdin

   Every line of input is passed to a line_parser, which accumulates the
   total word count and the per-word occurrence counts. Afterwards the
   totals are printed, followed by one line per distinct word showing
   the word, its count and its relative frequency (count / total).

   Returns 0 on success, or 1 (after printing usage) when more than one
   argument is supplied.
*/
int main(int argc, char* argv[])
{
   typedef strtk::multiple_char_delimiter_predicate predicate_t;
   typedef line_parser<const predicate_t> lp_t;

   // Delimiter set: start from all characters and remove the lowercase
   // and uppercase letters, leaving everything that is not a letter.
   const std::string delimiters = strtk::ext_string::all_chars()
                                - strtk::ext_string::all_lowercase_letters()
                                - strtk::ext_string::all_uppercase_letters();

   static const predicate_t predicate(delimiters);

   map_t word_list;
   unsigned long long word_count = 0;

   switch (argc)
   {
      // Consume input from stdin
      case 1  : strtk::for_each_line(std::cin, lp_t(word_count, word_list, predicate));
                break;

      // Consume input from user specified file
      case 2  : strtk::for_each_line(argv[1], lp_t(word_count, word_list, predicate));
                break;

      default :
                {
                   std::cout << "usage: strtk_wordfreq <file name>" << std::endl;
                   std::cout << "usage: cat words.txt | strtk_wordfreq" << std::endl;
                   return 1;
                }
   }

   std::cout << "Word count: " << word_count << std::endl;
   std::cout << "Unique word count: " << word_list.size() << std::endl;

   // The loop body only runs when at least one word was counted, so the
   // division by word_count below never sees a zero denominator.
   map_t::iterator itr = word_list.begin();

   while (word_list.end() != itr)
   {
      // The mapped value is an unsigned int, hence %u (not %d).
      printf("%s %10u %10.9f\n",
             strtk::text::right_align(15, ' ', itr->first).c_str(),
             itr->second,
             (1.0 * itr->second) / word_count);
      ++itr;
   }

   return 0;
}