ImportMediawiki.class.php 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573
  1. <?php
  2. /* vim: set expandtab sw=4 ts=4 sts=4: */
  3. /**
  4. * MediaWiki import plugin for phpMyAdmin
  5. *
  6. * @package PhpMyAdmin-Import
  7. * @subpackage MediaWiki
  8. */
  // Stop immediately unless the PHPMYADMIN constant has been defined
  // by the code that loads this file.
  if (defined('PHPMYADMIN') === false) {
      exit;
  }
  12. /* Get the import interface */
  13. require_once 'libraries/plugins/ImportPlugin.class.php';
  14. /**
  15. * Handles the import for the MediaWiki format
  16. *
  17. * @package PhpMyAdmin-Import
  18. * @subpackage MediaWiki
  19. */
  20. class ImportMediawiki extends ImportPlugin
  21. {
  22. /**
  23. * Whether to analyze tables
  24. *
  25. * @var bool
  26. */
  27. private $_analyze;
  /**
   * Builds the plugin and populates its properties right away
   * by delegating to setProperties().
   */
  public function __construct()
  {
      $this->setProperties();
  }
  /**
   * Initialises the analyze flag and the plugin's descriptive properties
   * (label, file extension, MIME type, options). Invoked by the constructor.
   *
   * The analyze flag is switched on for every value of
   * $GLOBALS['plugin_param'] except the string 'table'.
   *
   * @return void
   */
  protected function setProperties()
  {
      $this->_setAnalyze($GLOBALS['plugin_param'] !== 'table');

      // Same path the original interpolation produced (double slash included)
      include_once 'libraries/properties/' . '/plugins/ImportPluginProperties.class.php';

      $properties = new ImportPluginProperties();
      $properties->setText(__('MediaWiki Table'));
      $properties->setExtension('txt');
      $properties->setMimeType('text/plain');
      $properties->setOptions(array());
      $properties->setOptionsText(__('Options'));

      $this->properties = $properties;
  }
  /**
   * Observer callback, invoked when a PluginManager this plugin is attached
   * to calls PluginManager::notify(). This plugin does not react to it.
   *
   * @param SplSubject $subject The PluginManager notifying the observer
   *                            of an update.
   *
   * @return void
   */
  public function update(SplSubject $subject)
  {
      // Intentionally left empty
  }
  /**
   * Handles the whole import logic
   *
   * Pulls the input through PMA_importGetNextChunk(), splits it into lines
   * and walks them with a small state machine:
   *  - lines starting with "<!--" open a comment; inside a comment the lines
   *    "Table data for `name`" / "Table structure for `name`" select the
   *    current table name or mark a structure section, which is skipped
   *  - "{|" starts a table, "|-" / "|+" start a row, "|}" ends the table
   *    and hands it to _importDataOneTable()
   *  - lines starting with "|" or "!" hold cells ("!" marks header cells)
   * Any other non-empty line raises an error.
   *
   * @global bool        $error          set to true on an invalid input line
   * @global bool        $timeout_passed stops the loop when true
   * @global bool        $finished       true once the whole input was read
   * @global int         $offset         rewound by the size of the unparsed
   *                                     data when reading a chunk fails
   * @global PMA_Message $message        receives the error message
   *
   * @return void
   */
  public function doImport()
  {
      global $error, $timeout_passed, $finished, $offset, $message;

      // Input that has not been parsed yet. New chunks are appended to it;
      // after each pass only the trailing, possibly incomplete, line is kept
      $buffer = '';
      // Remembers whether the current buffer line is part of a comment
      $inside_comment = false;
      // Remembers whether the current buffer line is part of a data comment
      $inside_data_comment = false;
      // Remembers whether the current buffer line is part of a structure comment
      $inside_structure_comment = false;
      // MediaWiki only accepts "\n" as row terminator
      $mediawiki_new_line = "\n";
      // Initialize the name of the current table
      $cur_table_name = "";
      // State of the table being read. Initialised here so that row or cell
      // lines met before any "{|" do not touch undefined variables
      $cur_temp_table_headers = array();
      $cur_temp_table = array();
      $cur_temp_line = array();
      $in_table_header = false;

      while (! $finished && ! $error && ! $timeout_passed) {
          $data = PMA_importGetNextChunk();

          if ($data === false) {
              // Subtract data we didn't handle yet and stop processing
              $offset -= strlen($buffer);
              break;
          }

          // $data === true means there is nothing new to append and only
          // the rest of the buffer has to be handled
          if ($data !== true) {
              // Append new data to buffer
              $buffer .= $data;
              unset($data);
              // Don't parse string if we're not at the end
              // and don't have a new line inside
              if (! $finished
                  && strpos($buffer, $mediawiki_new_line) === false
              ) {
                  continue;
              }
          }

          // Process the buffer line by line
          $buffer_lines = explode($mediawiki_new_line, $buffer);
          $full_buffer_lines_count = count($buffer_lines);

          if (! $finished) {
              // The final line of the current chunk may be incomplete:
              // keep it in the buffer so the next chunk is appended to it
              $full_buffer_lines_count -= 1;
              $buffer = $buffer_lines[$full_buffer_lines_count];
          } else {
              // Everything gets parsed in this pass
              $buffer = '';
          }

          for ($line_nr = 0; $line_nr < $full_buffer_lines_count; ++ $line_nr) {
              $cur_buffer_line = trim($buffer_lines[$line_nr]);

              // If the line is empty, go to the next one
              if ($cur_buffer_line === '') {
                  continue;
              }

              $first_character = $cur_buffer_line[0];
              $line_start = substr($cur_buffer_line, 0, 2);

              if (strncmp($cur_buffer_line, '<!--', 4) === 0) {
                  // Beginning of comment
                  $inside_comment = true;
                  continue;
              } elseif ($inside_comment) {
                  if (strncmp($cur_buffer_line, '-->', 3) === 0) {
                      // End of comment.
                      // Only data comments are closed. The structure comments
                      // will be closed when a data comment begins (in order to
                      // skip structure tables)
                      if ($inside_data_comment) {
                          $inside_data_comment = false;
                      }

                      // End comments that are not related to table structure
                      if (! $inside_structure_comment) {
                          $inside_comment = false;
                      }
                  } else {
                      // Check table name
                      $match_table_name = array();
                      if (preg_match(
                          "/^Table data for `(.*)`$/",
                          $cur_buffer_line,
                          $match_table_name
                      )
                      ) {
                          $cur_table_name = $match_table_name[1];
                          $inside_data_comment = true;
                          // End ignoring structure rows
                          $inside_structure_comment = false;
                      } elseif (preg_match(
                          "/^Table structure for `(.*)`$/",
                          $cur_buffer_line
                      )
                      ) {
                          // The structure comments will be ignored
                          $inside_structure_comment = true;
                      }
                  }
                  continue;
              } elseif ($line_start === '{|') {
                  // Start of table: reset everything collected so far,
                  // including the headers, so that a table without a header
                  // row does not inherit those of the previous table
                  $cur_temp_table = array();
                  $cur_temp_table_headers = array();
                  $cur_temp_line = array();
                  $in_table_header = false;
                  // The current line does not contain any column information
              } elseif ($line_start === '|-'
                  || $line_start === '|+'
                  || $line_start === '|}'
              ) {
                  // Begin of row or end of table:
                  // store the cells collected for the previous row
                  if (! empty($cur_temp_line)) {
                      if ($in_table_header) {
                          // Cells were marked with '!': they are the headers
                          $cur_temp_table_headers = $cur_temp_line;
                      } else {
                          // Normal line, add it to the table
                          $cur_temp_table [] = $cur_temp_line;
                      }
                  }

                  // Empty the temporary buffer
                  $cur_temp_line = array();

                  if ($line_start === '|}') {
                      $current_table = array(
                          $cur_table_name,
                          $cur_temp_table_headers,
                          $cur_temp_table
                      );

                      // Import the current table data into the database
                      $this->_importDataOneTable($current_table);

                      // Reset table name
                      $cur_table_name = "";
                  }
                  // What's after the row tag is now only attributes
              } elseif ($first_character === '|' || $first_character === '!') {
                  // Cell elements
                  if ($first_character === '!') {
                      // Header cells: mark as table header, but treat the
                      // '!!' separators like the normal '||' ones
                      $cur_buffer_line = str_replace(
                          '!!', '||', $cur_buffer_line
                      );
                      $in_table_header = true;
                  } else {
                      $in_table_header = false;
                  }

                  // Loop through each table cell
                  $cells = $this->_explodeMarkup($cur_buffer_line);
                  foreach ($cells as $cell) {
                      // A cell could contain both parameters and data
                      $cell_data = explode('|', $cell, 2);

                      // A '|' inside a link should not be mistaken as
                      // delimiting cell parameters: only drop the part in
                      // front of the '|' when it holds no '[['.
                      // (strpos() never returns true, so compare with false)
                      if (strpos($cell_data[0], '[[') === false) {
                          if (count($cell_data) == 1) {
                              $cell = $cell_data[0];
                          } else {
                              $cell = $cell_data[1];
                          }
                      }

                      // Delete the beginning of the column, if there is one
                      $cell = trim($cell);
                      $col_start_chars = array("|", "!");
                      foreach ($col_start_chars as $col_start_char) {
                          if (strpos($cell, $col_start_char) === 0) {
                              $cell = trim(substr($cell, 1));
                          }
                      }

                      // Add the cell to the row
                      $cur_temp_line [] = $cell;
                  } // foreach $cells
              } else {
                  // If it's none of the above, then the current line has a bad
                  // format. $message is global so that the caller can show it
                  $message = PMA_Message::error(
                      __('Invalid format of mediawiki input on line: <br />%s.')
                  );
                  $message->addParam($cur_buffer_line);
                  $error = true;
              }
          } // End treating full buffer lines
      } // while - finished parsing buffer
  }
  /**
   * Imports data from a single table
   *
   * When the analyze flag (see _getAnalyze()) is set, the table gets a
   * name and headers if it lacks them, its columns are analysed with
   * PMA_analyzeTable() and the result is passed to _executeImportTables().
   * PMA_importRunQuery() is called in every case.
   *
   * @param array $table containing all table info:
   * <code>
   *    $table[0] - string containing table name
   *    $table[1] - array[]   of table headers
   *    $table[2] - array[][] of table content rows
   * </code>
   *
   * @return void
   */
  private function _importDataOneTable ($table)
  {
      if ($this->_getAnalyze()) {
          // Set the table name
          $this->_setTableName($table[0]);

          // Set generic names for table headers if they don't exist.
          // A table may have no content rows at all, so do not read
          // $table[2][0] blindly
          $first_row = isset($table[2][0]) ? $table[2][0] : array();
          $this->_setTableHeaders($table[1], $first_row);

          // Create the tables array to be used in PMA_buildSQL()
          $tables = array(
              array($table[0], $table[1], $table[2])
          );

          // Obtain the best-fit MySQL types for each column
          $analyses = array(
              PMA_analyzeTable($tables[0])
          );

          $this->_executeImportTables($tables, $analyses);
      }

      // Commit any possible data in buffers
      PMA_importRunQuery();
  }
  /**
   * Gives the table a generated name when it has none
   *
   * The generated name is 'TABLE n', where n is one more than the number
   * of entries returned by SHOW TABLES. A non-empty name is left untouched.
   *
   * @param string &$table_name reference to the name of the table
   *
   * @return void
   */
  private function _setTableName(&$table_name)
  {
      if (! empty($table_name)) {
          return;
      }

      $existing_tables = PMA_DBI_fetch_result('SHOW TABLES');
      // todo check if the name below already exists
      $next_number = count($existing_tables) + 1;
      $table_name = 'TABLE ' . $next_number;
  }
  /**
   * Fills in generic column names (COL 1, COL 2, ...) when a table
   * has no headers; existing headers are left as they are
   *
   * @param array &$table_headers reference to the array containing the headers
   *                              of a table
   * @param array $table_row      array containing the first content row;
   *                              its size gives the number of columns
   *
   * @return void
   */
  private function _setTableHeaders(&$table_headers, $table_row)
  {
      if (! empty($table_headers)) {
          return;
      }

      $column_total = count($table_row);
      for ($position = 1; $position <= $column_total; $position++) {
          $table_headers[$position - 1] = 'COL ' . $position;
      }
  }
  /**
   * Picks the target database name and the options, then calls PMA_buildSQL()
   * Used in $this->_importDataOneTable()
   *
   * When the global $db is a non-empty string it is used as database name
   * with the 'create_db' option set to false; otherwise the name
   * 'mediawiki_DB' is used and no options are passed.
   *
   * @param array &$tables   structure:
   *              array(
   *                  array(table_name, array() column_names, array()() rows)
   *              )
   * @param array &$analyses structure:
   *              $analyses = array(
   *                  array(array() column_types, array() column_sizes)
   *              )
   *
   * @global string $db name of the database to import in
   *
   * @return void
   */
  private function _executeImportTables(&$tables, &$analyses)
  {
      global $db;

      $has_selected_db = strlen($db) > 0;

      // Database name, without backquotes
      $db_name = $has_selected_db ? $db : 'mediawiki_DB';
      // Associative array of options
      $options = $has_selected_db ? array('create_db' => false) : null;

      // Non-applicable parameter (array of SQL strings)
      $create = null;

      // Create and execute necessary SQL statements from data
      PMA_buildSQL($db_name, $tables, $analyses, $create, $options);

      unset($tables, $analyses);
  }
  /**
   * Replaces every '||' found inside a quoted attribute of an HTML-like tag
   * with the given replacement, leaving all other text as it is
   *
   * The text is scanned one character at a time. '<' and '>' (outside of a
   * quoted attribute) open and close a tag; inside a tag, a " or ' opens an
   * attribute that lasts until the same quote character comes back.
   *
   * @param string $start_delim start delimiter (not consulted by the body,
   *                            which hard-codes '<')
   * @param string $end_delim   end delimiter (not consulted by the body,
   *                            which hard-codes '>')
   * @param string $replace     the string to be replaced with
   * @param string $subject     the text to be replaced
   *
   * @return string with replacements
   */
  private function _delimiterReplace($start_delim, $end_delim, $replace, $subject)
  {
      // Text built so far
      $output = "";

      // Scanner state
      $in_tag = false;
      $in_attribute = false;
      // Quote character (" or ') that opened the current attribute
      $attribute_quote = false;
      // True when the previous character was the first '|' of a possible '||'
      $pipe_pending = false;

      $length = strlen($subject);
      for ($pos = 0; $pos < $length; $pos++) {
          $char = $subject[$pos];

          if ($char === '|') {
              if (! $in_attribute) {
                  // Outside of attributes pipes are copied as they are;
                  // a complete '||' also ends any open tag
                  $output .= $char;
                  if ($pipe_pending) {
                      $in_tag = false;
                  }
              } elseif ($pipe_pending) {
                  // Second '|' inside an attribute: the pair is emitted
                  // as the replacement
                  $output .= $replace;
              }

              // A second pipe completes the separator, a first one starts it
              $pipe_pending = ! $pipe_pending;
              continue;
          }

          // A lone '|' inside an attribute was held back; emit it now
          if ($pipe_pending && $in_attribute) {
              $output .= "|";
          }
          $pipe_pending = false;

          // Every other character is copied to the output
          $output .= $char;

          if ($char === '<' && ! $in_attribute) {
              // start of a tag
              $in_tag = true;
          } elseif ($char === '>' && ! $in_attribute) {
              // end of a tag
              $in_tag = false;
          } elseif (($char === '"' || $char === "'") && $in_tag) {
              if (! $in_attribute) {
                  // start of an attribute: remember which quote opened it
                  $in_attribute = true;
                  $attribute_quote = $char;
              } elseif ($char === $attribute_quote) {
                  // matching quote: end of the attribute
                  $in_attribute = false;
                  $attribute_quote = false;
              }
          }
      } // end for each character in $subject

      return $output;
  }
  /**
   * Splits a line into its cells, much like explode() would
   *
   * The separator is '||' (standard in the mediawiki format). Occurrences
   * that _delimiterReplace() masks with a placeholder are not treated as
   * separators and are restored in the returned items.
   * Used in parsing buffer lines containing data cells
   *
   * @param string $text text to be split
   *
   * @return array
   */
  private function _explodeMarkup($text)
  {
      $separator = "||";
      $placeholder = "\x00";

      // Drop any placeholder already present in the input
      $without_placeholder = str_replace($placeholder, '', $text);

      // Mask the separators found inside HTML-like tags
      $masked = $this->_delimiterReplace(
          "<", ">", $placeholder, $without_placeholder
      );

      // Split on the remaining separators and unmask every item
      return str_replace($placeholder, $separator, explode($separator, $masked));
  }
  482. /* ~~~~~~~~~~~~~~~~~~~~ Getters and Setters ~~~~~~~~~~~~~~~~~~~~ */
  /**
   * Tells whether tables should be analyzed
   *
   * @return bool the value last stored with _setAnalyze()
   */
  private function _getAnalyze()
  {
      return $this->_analyze;
  }
  /**
   * Stores whether tables should be analyzed
   *
   * @param bool $analyze true to analyze tables, false otherwise
   *
   * @return void
   */
  private function _setAnalyze($analyze)
  {
      $this->_analyze = $analyze;
  }
  503. }