forked from donnekgit/andika
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebtrans.php
130 lines (100 loc) · 4.55 KB
/
webtrans.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
<?php
/*
*********************************************************************
Copyright Kevin Donnelly 2012.
kevindonnelly.org.uk
This file is part of Andika!, a set of tools for writing Swahili in Arabic script.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License or the GNU
Affero General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
and the GNU Affero General Public License along with this program.
If not, see <http://www.gnu.org/licenses/>.
*********************************************************************
*/
mb_internal_encoding("UTF-8");
include("./includes/fns.php");

// Get the web address.  A missing or non-string field is treated as an empty
// address, so a malformed request cannot raise an "undefined index" notice or
// hand an array to trim().
$webtrans='';
if (isset($_POST['webtrans']) && is_string($_POST['webtrans']))
{
    $webtrans=trim($_POST['webtrans']);
}

// Pass the option not to show sukun if that option has been ticked.
// When the field is absent the value is NULL, which is what the unguarded
// read used to produce, but without the notice.
$no_sukun=isset($_POST['sakani']) ? $_POST['sakani'] : null;

// Without an address there is nothing to download or convert.
if ($webtrans==='')
{
    echo "<h6>Please enter a web address.</h6>";
    exit;
}

// Check the web address is clean.  One anchored match over the whole string
// replaces the byte-by-byte loop: every byte must be in the allowed set, so any
// non-ASCII byte is still rejected.  \z (rather than $) means a trailing
// newline cannot slip through.
if (!preg_match('%\A[-a-zA-Z0-9_:./\']+\z%', $webtrans))
{
    echo "<h6>Only alphanumeric characters, full-stops (.), hyphens (-), underscores (_), single quotes ('), colons (:) and slashes (/) are allowed in the web address.</h6>";
    exit;
}
// IMPORTANT: You need to allow access to the webtrans holding directory.
$newname=basename($webtrans);
$rawfile="webtrans/$newname";
$prepfile="webtrans/{$newname}_ar.txt";

// NOTE(review): the raw download is stored under a name taken from the
// address, so an address ending in e.g. ".php" leaves a file with that
// extension inside webtrans/.  Confirm the web server never executes files
// from that directory.

// Download the webpage into the webtrans holding directory.
// -k (--convert-links) keeps the downloaded links pointing to the original web-based files.
// --no-cache ensures a fresh copy is taken every time, rather than using cached copies.
// -O outputs the download to a file.
// escapeshellarg() quotes each argument safely (including single quotes in
// words like "ng'ombe"), and "--" stops wget from treating an address that
// starts with a hyphen as one of its own options.
// exec() is used, like the other shell calls in this script, so that nothing
// the command prints is passed through into the response.
exec("wget -k --no-cache -O ".escapeshellarg($rawfile)." -- ".escapeshellarg($webtrans));

// Read the contents into PHP.  is_file() avoids a warning when nothing was
// written, or when the name resolves to a directory (e.g. "..").
$webpage=is_file($rawfile) ? file_get_contents($rawfile) : false;
if ($webpage===false || $webpage==='')
{
    // Report the failure instead of silently converting an empty page.
    echo "<h6>The web page could not be downloaded.</h6>";
    exit;
}

// Format the contents, ready for conversion.
$webpage=format_for_trans($webpage);

// Write out the contents again.
file_put_contents($prepfile, $webpage);

// Get rid of blank lines.
exec("sed -i '/^$/d' ".escapeshellarg($prepfile));
// Open the file we'll write the conversion to.
$fp = fopen("webtrans/{$newname}_ar_done.html", "w");
// Read the prepped contents in.
$lines=file("webtrans/{$newname}_ar.txt");

// Stop cleanly if either file is unavailable, rather than looping over FALSE
// or writing to an invalid handle.
if ($fp===false || $lines===false)
{
    if ($fp!==false)
    {
        fclose($fp);
    }
    echo "<h6>The web page could not be converted.</h6>";
    exit;
}

// The ticked options do not change from line to line, so read them once.
$long_vowel_sukun=isset($_POST['longvowel']); // Show sukun on long waw and yeh.
$convert_digits=isset($_POST['numbers']);     // Convert numbers as well.

foreach ($lines as $line)
{
    // If the line is HTML stuff, just write it out unaltered.
    if (preg_match("/^</", $line))
    {
        fwrite($fp, $line);
        continue;
    }

    // The line is not HTML stuff, so convert it.
    // (c is no longer converted to s/k with lose_c() here: kaf with three dots
    // is used instead, which will allow round-tripping.)

    // Mark penultimate syllables in the standard orthography, and a few other things.
    $prepped=prep_rom($line);
    $translit=rom2ar($prepped, $no_sukun);

    if ($long_vowel_sukun)
    {
        $translit=waw_yeh_sukun($translit);
    }

    // Turn each U+XXXX code in the transliteration into a numeric entity, then
    // decode the entities into UTF-8 characters.
    $arabic=html_entity_decode(preg_replace("/U\+([0-9A-F]{4,5})/", "&#x\\1;", $translit), ENT_NOQUOTES, 'UTF-8');

    // Convert numbers separately, and only if that option has been ticked.
    // A callback replaces the old /e (eval) modifier, which was removed in
    // PHP 7 and made preg_replace() return NULL there.
    if ($convert_digits)
    {
        $arabic=preg_replace_callback(
            "/([0-9]+)/",
            function ($digits)
            {
                return convert_numbers($digits[1]);
            },
            $arabic
        );
    }

    $line="<span style=\"font-family: Scheherazade; font-size: 25px; text-align: right; line-height: 1.5;\">".$arabic."</span>";
    fwrite($fp, $line);
}
fclose($fp);
// Fix the title of the page.
// escapeshellarg() quotes the file name for the shell, including single quotes
// in words like "ng'ombe".
$donefile="webtrans/{$newname}_ar_done.html";
exec("sed -ri 's/<title>/<title>Conversion/' ".escapeshellarg($donefile));

// Escape both addresses for use inside an HTML attribute, so this output does
// not depend on the earlier character check having been applied.
$converted_href=htmlspecialchars($donefile, ENT_QUOTES, 'UTF-8');
$original_href=htmlspecialchars($webtrans, ENT_QUOTES, 'UTF-8');

// Give two links where the converted and original pages can be opened.
echo "<img alt=\"spinner\" id=\"spinner\" src=\"images/ajax-loader.gif\" style=\"display:none;\" />";
echo "<h6>Click to see the <a href=\"{$converted_href}\" target=_blank>converted webpage</a> in a new window or tab.</h6>";
// The closing parenthesis now sits inside the <em>, matching the opening one.
echo "<em>(Click to see the <a href=\"{$original_href}\" target=_blank> original webpage</a> in a new window or tab.)</em>";
?>