ImportMediawiki.php 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620
  1. <?php
  2. /**
  3. * MediaWiki import plugin for phpMyAdmin
  4. */
  5. declare(strict_types=1);
  6. namespace PhpMyAdmin\Plugins\Import;
  7. use PhpMyAdmin\File;
  8. use PhpMyAdmin\Message;
  9. use PhpMyAdmin\Plugins\ImportPlugin;
  10. use PhpMyAdmin\Properties\Plugins\ImportPluginProperties;
  11. use function count;
  12. use function explode;
  13. use function mb_strlen;
  14. use function mb_strpos;
  15. use function mb_substr;
  16. use function preg_match;
  17. use function str_replace;
  18. use function strcmp;
  19. use function strlen;
  20. use function trim;
  21. /**
  22. * Handles the import for the MediaWiki format
  23. */
  24. class ImportMediawiki extends ImportPlugin
  25. {
  26. /**
  27. * Whether to analyze tables
  28. *
  29. * @var bool
  30. */
  31. private $analyze;
  32. public function __construct()
  33. {
  34. parent::__construct();
  35. $this->setProperties();
  36. }
  37. /**
  38. * Sets the import plugin properties.
  39. * Called in the constructor.
  40. *
  41. * @return void
  42. */
  43. protected function setProperties()
  44. {
  45. $this->setAnalyze(false);
  46. if ($GLOBALS['plugin_param'] !== 'table') {
  47. $this->setAnalyze(true);
  48. }
  49. $importPluginProperties = new ImportPluginProperties();
  50. $importPluginProperties->setText(__('MediaWiki Table'));
  51. $importPluginProperties->setExtension('txt');
  52. $importPluginProperties->setMimeType('text/plain');
  53. $importPluginProperties->setOptions([]);
  54. $importPluginProperties->setOptionsText(__('Options'));
  55. $this->properties = $importPluginProperties;
  56. }
  57. /**
  58. * Handles the whole import logic
  59. *
  60. * @param array $sql_data 2-element array with sql data
  61. *
  62. * @return void
  63. */
  64. public function doImport(?File $importHandle = null, array &$sql_data = [])
  65. {
  66. global $error, $timeout_passed, $finished;
  67. // Defaults for parser
  68. // The buffer that will be used to store chunks read from the imported file
  69. $buffer = '';
  70. // Used as storage for the last part of the current chunk data
  71. // Will be appended to the first line of the next chunk, if there is one
  72. $last_chunk_line = '';
  73. // Remembers whether the current buffer line is part of a comment
  74. $inside_comment = false;
  75. // Remembers whether the current buffer line is part of a data comment
  76. $inside_data_comment = false;
  77. // Remembers whether the current buffer line is part of a structure comment
  78. $inside_structure_comment = false;
  79. // MediaWiki only accepts "\n" as row terminator
  80. $mediawiki_new_line = "\n";
  81. // Initialize the name of the current table
  82. $cur_table_name = '';
  83. $cur_temp_table_headers = [];
  84. $cur_temp_table = [];
  85. $in_table_header = false;
  86. while (! $finished && ! $error && ! $timeout_passed) {
  87. $data = $this->import->getNextChunk($importHandle);
  88. if ($data === false) {
  89. // Subtract data we didn't handle yet and stop processing
  90. $GLOBALS['offset'] -= mb_strlen($buffer);
  91. break;
  92. }
  93. if ($data !== true) {
  94. // Append new data to buffer
  95. $buffer = $data;
  96. unset($data);
  97. // Don't parse string if we're not at the end
  98. // and don't have a new line inside
  99. if (mb_strpos($buffer, $mediawiki_new_line) === false) {
  100. continue;
  101. }
  102. }
  103. // Because of reading chunk by chunk, the first line from the buffer
  104. // contains only a portion of an actual line from the imported file.
  105. // Therefore, we have to append it to the last line from the previous
  106. // chunk. If we are at the first chunk, $last_chunk_line should be empty.
  107. $buffer = $last_chunk_line . $buffer;
  108. // Process the buffer line by line
  109. $buffer_lines = explode($mediawiki_new_line, $buffer);
  110. $full_buffer_lines_count = count($buffer_lines);
  111. // If the reading is not finalized, the final line of the current chunk
  112. // will not be complete
  113. if (! $finished) {
  114. $last_chunk_line = $buffer_lines[--$full_buffer_lines_count];
  115. }
  116. for ($line_nr = 0; $line_nr < $full_buffer_lines_count; ++$line_nr) {
  117. $cur_buffer_line = trim($buffer_lines[$line_nr]);
  118. // If the line is empty, go to the next one
  119. if ($cur_buffer_line === '') {
  120. continue;
  121. }
  122. $first_character = $cur_buffer_line[0];
  123. $matches = [];
  124. // Check beginning of comment
  125. if (! strcmp(mb_substr($cur_buffer_line, 0, 4), '<!--')) {
  126. $inside_comment = true;
  127. continue;
  128. }
  129. if ($inside_comment) {
  130. // Check end of comment
  131. if (! strcmp(mb_substr($cur_buffer_line, 0, 4), '-->')
  132. ) {
  133. // Only data comments are closed. The structure comments
  134. // will be closed when a data comment begins (in order to
  135. // skip structure tables)
  136. if ($inside_data_comment) {
  137. $inside_data_comment = false;
  138. }
  139. // End comments that are not related to table structure
  140. if (! $inside_structure_comment) {
  141. $inside_comment = false;
  142. }
  143. } else {
  144. // Check table name
  145. $match_table_name = [];
  146. if (preg_match(
  147. '/^Table data for `(.*)`$/',
  148. $cur_buffer_line,
  149. $match_table_name
  150. )
  151. ) {
  152. $cur_table_name = $match_table_name[1];
  153. $inside_data_comment = true;
  154. $inside_structure_comment
  155. = $this->mngInsideStructComm(
  156. $inside_structure_comment
  157. );
  158. } elseif (preg_match(
  159. '/^Table structure for `(.*)`$/',
  160. $cur_buffer_line,
  161. $match_table_name
  162. )
  163. ) {
  164. // The structure comments will be ignored
  165. $inside_structure_comment = true;
  166. }
  167. }
  168. continue;
  169. }
  170. if (preg_match('/^\{\|(.*)$/', $cur_buffer_line, $matches)) {
  171. // Check start of table
  172. // This will store all the column info on all rows from
  173. // the current table read from the buffer
  174. $cur_temp_table = [];
  175. // Will be used as storage for the current row in the buffer
  176. // Once all its columns are read, it will be added to
  177. // $cur_temp_table and then it will be emptied
  178. $cur_temp_line = [];
  179. // Helps us differentiate the header columns
  180. // from the normal columns
  181. $in_table_header = false;
  182. // End processing because the current line does not
  183. // contain any column information
  184. } elseif (mb_substr($cur_buffer_line, 0, 2) === '|-'
  185. || mb_substr($cur_buffer_line, 0, 2) === '|+'
  186. || mb_substr($cur_buffer_line, 0, 2) === '|}'
  187. ) {
  188. // Check begin row or end table
  189. // Add current line to the values storage
  190. if (! empty($cur_temp_line)) {
  191. // If the current line contains header cells
  192. // ( marked with '!' ),
  193. // it will be marked as table header
  194. if ($in_table_header) {
  195. // Set the header columns
  196. $cur_temp_table_headers = $cur_temp_line;
  197. } else {
  198. // Normal line, add it to the table
  199. $cur_temp_table[] = $cur_temp_line;
  200. }
  201. }
  202. // Empty the temporary buffer
  203. $cur_temp_line = [];
  204. // No more processing required at the end of the table
  205. if (mb_substr($cur_buffer_line, 0, 2) === '|}') {
  206. $current_table = [
  207. $cur_table_name,
  208. $cur_temp_table_headers,
  209. $cur_temp_table,
  210. ];
  211. // Import the current table data into the database
  212. $this->importDataOneTable($current_table, $sql_data);
  213. // Reset table name
  214. $cur_table_name = '';
  215. }
  216. // What's after the row tag is now only attributes
  217. } elseif (($first_character === '|') || ($first_character === '!')) {
  218. // Check cell elements
  219. // Header cells
  220. if ($first_character === '!') {
  221. // Mark as table header, but treat as normal row
  222. $cur_buffer_line = str_replace('!!', '||', $cur_buffer_line);
  223. // Will be used to set $cur_temp_line as table header
  224. $in_table_header = true;
  225. } else {
  226. $in_table_header = false;
  227. }
  228. // Loop through each table cell
  229. $cells = $this->explodeMarkup($cur_buffer_line);
  230. foreach ($cells as $cell) {
  231. $cell = $this->getCellData($cell);
  232. // Delete the beginning of the column, if there is one
  233. $cell = trim($cell);
  234. $col_start_chars = [
  235. '|',
  236. '!',
  237. ];
  238. foreach ($col_start_chars as $col_start_char) {
  239. $cell = $this->getCellContent($cell, $col_start_char);
  240. }
  241. // Add the cell to the row
  242. $cur_temp_line[] = $cell;
  243. }
  244. } else {
  245. // If it's none of the above, then the current line has a bad
  246. // format
  247. $message = Message::error(
  248. __('Invalid format of mediawiki input on line: <br>%s.')
  249. );
  250. $message->addParam($cur_buffer_line);
  251. $error = true;
  252. }
  253. }
  254. }
  255. }
  256. /**
  257. * Imports data from a single table
  258. *
  259. * @param array $table containing all table info:
  260. * <code> $table[0] - string
  261. * containing table name
  262. * $table[1] - array[] of
  263. * table headers $table[2] -
  264. * array[][] of table content
  265. * rows </code>
  266. * @param array $sql_data 2-element array with sql data
  267. *
  268. * @return void
  269. *
  270. * @global bool $analyze whether to scan for column types
  271. */
  272. private function importDataOneTable(array $table, array &$sql_data)
  273. {
  274. $analyze = $this->getAnalyze();
  275. if ($analyze) {
  276. // Set the table name
  277. $this->setTableName($table[0]);
  278. // Set generic names for table headers if they don't exist
  279. $this->setTableHeaders($table[1], $table[2][0]);
  280. // Create the tables array to be used in Import::buildSql()
  281. $tables = [];
  282. $tables[] = [
  283. $table[0],
  284. $table[1],
  285. $table[2],
  286. ];
  287. // Obtain the best-fit MySQL types for each column
  288. $analyses = [];
  289. $analyses[] = $this->import->analyzeTable($tables[0]);
  290. $this->executeImportTables($tables, $analyses, $sql_data);
  291. }
  292. // Commit any possible data in buffers
  293. $this->import->runQuery('', '', $sql_data);
  294. }
  295. /**
  296. * Sets the table name
  297. *
  298. * @param string $table_name reference to the name of the table
  299. *
  300. * @return void
  301. */
  302. private function setTableName(&$table_name)
  303. {
  304. global $dbi;
  305. if (! empty($table_name)) {
  306. return;
  307. }
  308. $result = $dbi->fetchResult('SHOW TABLES');
  309. // todo check if the name below already exists
  310. $table_name = 'TABLE ' . (count($result) + 1);
  311. }
  312. /**
  313. * Set generic names for table headers, if they don't exist
  314. *
  315. * @param array $table_headers reference to the array containing the headers
  316. * of a table
  317. * @param array $table_row array containing the first content row
  318. *
  319. * @return void
  320. */
  321. private function setTableHeaders(array &$table_headers, array $table_row)
  322. {
  323. if (! empty($table_headers)) {
  324. return;
  325. }
  326. // The first table row should contain the number of columns
  327. // If they are not set, generic names will be given (COL 1, COL 2, etc)
  328. $num_cols = count($table_row);
  329. for ($i = 0; $i < $num_cols; ++$i) {
  330. $table_headers[$i] = 'COL ' . ($i + 1);
  331. }
  332. }
  333. /**
  334. * Sets the database name and additional options and calls Import::buildSql()
  335. * Used in PMA_importDataAllTables() and $this->importDataOneTable()
  336. *
  337. * @param array $tables structure:
  338. * array(
  339. * array(table_name, array() column_names, array()()
  340. * rows)
  341. * )
  342. * @param array $analyses structure:
  343. * $analyses = array(
  344. * array(array() column_types, array() column_sizes)
  345. * )
  346. * @param array $sql_data 2-element array with sql data
  347. *
  348. * @return void
  349. *
  350. * @global string $db name of the database to import in
  351. */
  352. private function executeImportTables(array &$tables, array &$analyses, array &$sql_data)
  353. {
  354. global $db;
  355. // $db_name : The currently selected database name, if applicable
  356. // No backquotes
  357. // $options : An associative array of options
  358. [$db_name, $options] = $this->getDbnameAndOptions($db, 'mediawiki_DB');
  359. // Array of SQL strings
  360. // Non-applicable parameters
  361. $create = null;
  362. // Create and execute necessary SQL statements from data
  363. $this->import->buildSql($db_name, $tables, $analyses, $create, $options, $sql_data);
  364. }
  365. /**
  366. * Replaces all instances of the '||' separator between delimiters
  367. * in a given string
  368. *
  369. * @param string $replace the string to be replaced with
  370. * @param string $subject the text to be replaced
  371. *
  372. * @return string with replacements
  373. */
  374. private function delimiterReplace($replace, $subject)
  375. {
  376. // String that will be returned
  377. $cleaned = '';
  378. // Possible states of current character
  379. $inside_tag = false;
  380. $inside_attribute = false;
  381. // Attributes can be declared with either " or '
  382. $start_attribute_character = false;
  383. // The full separator is "||";
  384. // This remembers if the previous character was '|'
  385. $partial_separator = false;
  386. // Parse text char by char
  387. for ($i = 0, $iMax = strlen($subject); $i < $iMax; $i++) {
  388. $cur_char = $subject[$i];
  389. // Check for separators
  390. if ($cur_char === '|') {
  391. // If we're not inside a tag, then this is part of a real separator,
  392. // so we append it to the current segment
  393. if (! $inside_attribute) {
  394. $cleaned .= $cur_char;
  395. if ($partial_separator) {
  396. $inside_tag = false;
  397. $inside_attribute = false;
  398. }
  399. } elseif ($partial_separator) {
  400. // If we are inside a tag, we replace the current char with
  401. // the placeholder and append that to the current segment
  402. $cleaned .= $replace;
  403. }
  404. // If the previous character was also '|', then this ends a
  405. // full separator. If not, this may be the beginning of one
  406. $partial_separator = ! $partial_separator;
  407. } else {
  408. // If we're inside a tag attribute and the current character is
  409. // not '|', but the previous one was, it means that the single '|'
  410. // was not appended, so we append it now
  411. if ($partial_separator && $inside_attribute) {
  412. $cleaned .= '|';
  413. }
  414. // If the char is different from "|", no separator can be formed
  415. $partial_separator = false;
  416. // any other character should be appended to the current segment
  417. $cleaned .= $cur_char;
  418. if ($cur_char === '<' && ! $inside_attribute) {
  419. // start of a tag
  420. $inside_tag = true;
  421. } elseif ($cur_char === '>' && ! $inside_attribute) {
  422. // end of a tag
  423. $inside_tag = false;
  424. } elseif (($cur_char === '"' || $cur_char == "'") && $inside_tag) {
  425. // start or end of an attribute
  426. if (! $inside_attribute) {
  427. $inside_attribute = true;
  428. // remember the attribute`s declaration character (" or ')
  429. $start_attribute_character = $cur_char;
  430. } else {
  431. if ($cur_char == $start_attribute_character) {
  432. $inside_attribute = false;
  433. // unset attribute declaration character
  434. $start_attribute_character = false;
  435. }
  436. }
  437. }
  438. }
  439. }
  440. return $cleaned;
  441. }
  442. /**
  443. * Separates a string into items, similarly to explode
  444. * Uses the '||' separator (which is standard in the mediawiki format)
  445. * and ignores any instances of it inside markup tags
  446. * Used in parsing buffer lines containing data cells
  447. *
  448. * @param string $text text to be split
  449. *
  450. * @return array
  451. */
  452. private function explodeMarkup($text)
  453. {
  454. $separator = '||';
  455. $placeholder = "\x00";
  456. // Remove placeholder instances
  457. $text = str_replace($placeholder, '', $text);
  458. // Replace instances of the separator inside HTML-like
  459. // tags with the placeholder
  460. $cleaned = $this->delimiterReplace($placeholder, $text);
  461. // Explode, then put the replaced separators back in
  462. $items = explode($separator, $cleaned);
  463. foreach ($items as $i => $str) {
  464. $items[$i] = str_replace($placeholder, $separator, $str);
  465. }
  466. return $items;
  467. }
  468. /* ~~~~~~~~~~~~~~~~~~~~ Getters and Setters ~~~~~~~~~~~~~~~~~~~~ */
  469. /**
  470. * Returns true if the table should be analyzed, false otherwise
  471. *
  472. * @return bool
  473. */
  474. private function getAnalyze()
  475. {
  476. return $this->analyze;
  477. }
  478. /**
  479. * Sets to true if the table should be analyzed, false otherwise
  480. *
  481. * @param bool $analyze status
  482. *
  483. * @return void
  484. */
  485. private function setAnalyze($analyze)
  486. {
  487. $this->analyze = $analyze;
  488. }
  489. /**
  490. * Get cell
  491. *
  492. * @param string $cell Cell
  493. *
  494. * @return mixed
  495. */
  496. private function getCellData($cell)
  497. {
  498. // A cell could contain both parameters and data
  499. $cell_data = explode('|', $cell, 2);
  500. // A '|' inside an invalid link should not
  501. // be mistaken as delimiting cell parameters
  502. if (mb_strpos($cell_data[0], '[[') === false) {
  503. return $cell;
  504. }
  505. if (count($cell_data) === 1) {
  506. return $cell_data[0];
  507. }
  508. return $cell_data[1];
  509. }
  510. /**
  511. * Manage $inside_structure_comment
  512. *
  513. * @param bool $inside_structure_comment Value to test
  514. *
  515. * @return bool
  516. */
  517. private function mngInsideStructComm($inside_structure_comment)
  518. {
  519. // End ignoring structure rows
  520. if ($inside_structure_comment) {
  521. $inside_structure_comment = false;
  522. }
  523. return $inside_structure_comment;
  524. }
  525. /**
  526. * Get cell content
  527. *
  528. * @param string $cell Cell
  529. * @param string $col_start_char Start char
  530. *
  531. * @return string
  532. */
  533. private function getCellContent($cell, $col_start_char)
  534. {
  535. if (mb_strpos($cell, $col_start_char) === 0) {
  536. $cell = trim(mb_substr($cell, 1));
  537. }
  538. return $cell;
  539. }
  540. }