-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcut.php
122 lines (101 loc) · 3.27 KB
/
cut.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
<?php
// cutter()
// cuts the input array into chunks of a fixed size, advancing the
// start of each chunk by a fixed shift, then folds an undersized
// final chunk into the chunk before it
// ARGS:
// $textarray: array containing the words of the text in
// individual slots
// $chunksize: the size in words of a chunk (whole number, >= 1)
// $shiftsize: the number of words to shift between the starts of
// consecutive chunks (whole number, >= 1); a shift smaller than
// $chunksize produces overlapping chunks
// $lastprop: the proportion of $chunksize the last chunk must reach
// to be kept as a chunk of its own
// RETURN: an array of chunks in textual order, where a chunk is a
// subset of the input array that keeps the input's keys, and
// each chunk is indexed by "first..last", the first and last
// word number in the chunk; an empty array for empty input
// THROWS: InvalidArgumentException if $chunksize or $shiftsize is
// smaller than 1 (a non-positive shift would never terminate)
function cutter( $textarray, $chunksize, $shiftsize, $lastprop ) {
    // sizes are word counts, so work with whole numbers only
    $chunksize = (int) $chunksize;
    $shiftsize = (int) $shiftsize;
    if ( $chunksize < 1 || $shiftsize < 1 ) {
        throw new InvalidArgumentException(
            'cutter(): $chunksize and $shiftsize must both be at least 1'
        );
    }

    $chunkarray = array();
    $total = count( $textarray );

    // walk the text one shift at a time; array_slice() stops at the
    // end of the input by itself and, with preserve_keys, keeps the
    // word numbers as keys
    for ( $start = 0; $start < $total; $start += $shiftsize ) {
        $chunk = array_slice( $textarray, $start, $chunksize, true );
        $keys = array_keys( $chunk );
        $index = $keys[0] . '..' . $keys[count( $keys ) - 1];
        $chunkarray[$index] = $chunk;
    }

    // determine the min size of the last chunk
    // err on the side of too much; better to have a chunk of
    // 4 in chunksize 3 than a chunk of 1
    $lastsize = ceil( $chunksize * $lastprop );

    // merging needs a previous chunk to merge into; a lone short
    // chunk is returned as it is
    if ( count( $chunkarray ) > 1 ) {
        $lastchunk = end( $chunkarray );
        if ( count( $lastchunk ) < $lastsize ) {
            // remove the short chunk and the one it is merged into
            array_pop( $chunkarray );
            $secondlast = array_pop( $chunkarray );

            // array union keeps the word numbers as keys and does not
            // duplicate words that both (overlapping) chunks contain
            $newchunk = $secondlast + $lastchunk;

            // reindex the merged chunk by its own first and last word
            $keys = array_keys( $newchunk );
            $index = $keys[0] . '..' . $keys[count( $keys ) - 1];
            $chunkarray[$index] = $newchunk;
        }
    }

    return $chunkarray;
}
// array_subset()
// returns the elements of a numerically indexed array whose index
// lies in [$start,$end), keeping their original indices; this is a
// simple version that doesn't think about non-numeric keys
// ARGS:
// $array: array indexed by numbers
// $start: index of first element in subset
// $end: index of first element not in subset
// RETURN: an array [$start,$end) from $array with
// the same indices, or null when no element falls in
// that range; indices that do not exist in $array are skipped
function array_subset( $array, $start, $end )
{
    // never read past the end of the input
    $limit = min( $end, count( $array ) );

    // start from an empty array so an empty range can be counted
    $subset = array();
    for ( $i = $start; $i < $limit; $i++ ) {
        // skip holes instead of copying an undefined element as null
        if ( array_key_exists( $i, $array ) ) {
            $subset[$i] = $array[$i];
        }
    }

    if ( count( $subset ) == 0 )
        return null;

    return $subset;
}
// count_words()
// tallies how many times each word occurs in the input
// ARGS:
// $textarray: array of words (taken by reference, not modified)
// RETURN: an array mapping each word to its number of occurrences,
// in order of first appearance; empty words are not counted
function count_words( &$textarray )
{
    $tally = array();

    foreach ( $textarray as $token ) {
        // only non-empty words are tallied
        if ( $token != "" ) {
            $key = "$token";
            if ( isset( $tally[$key] ) ) {
                $tally[$key]++;
            } else {
                $tally[$key] = 1;
            }
        }
    }

    return $tally;
}
// hash_sort()
// sorts a word => count array in place
// ARGS:
// $hash: array mapping words to counts (sorted by reference)
// $sort: 'c' to sort by count, highest first, with ties broken by
// word in ascending order; any other value sorts by word
// RETURN: nothing, $hash itself is reordered; its keys are kept
function hash_sort( &$hash, $sort ) {
    if ( $sort === 'c' ) {
        // nothing to reorder
        if ( count( $hash ) === 0 ) {
            return;
        }

        // grab array for word and counts
        $word = array_keys( $hash );
        $count = array_values( $hash );

        // sort the counts, and the words along with them
        array_multisort( $count, SORT_DESC, $word, SORT_ASC );

        // rebuild the hash from the sorted pairs; sorting $hash itself
        // with array_multisort() would renumber numeric keys, so a
        // word such as "1984" would be lost
        $hash = array_combine( $word, $count );
    }
    else {
        // sort by key, ie. the word name
        ksort( $hash );
    }
}
?>