use bio::data_structures::rank_select::RankSelect;
use bv::BitVec;
use bv::Bits;
use bv::BitsExt;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::iter::FromIterator;
use crate::WaveletTree;
use std::fmt::Debug;
#[derive(Serialize, Deserialize)]
pub struct WaveletTreeCompact<T: PartialEq + Copy> {
alphabet: Vec<T>,
bit_vec: RankSelect,
sequence_len: u64,
}
impl<T: PartialEq + Copy> WaveletTreeCompact<T> {
pub fn new(input: Vec<T>) -> WaveletTreeCompact<T> {
let sequence_len = input.len() as u64;
let mut alphabet: Vec<T> = Vec::new();
input.iter().for_each(|x| {
if !alphabet.contains(&x) {
alphabet.push(*x);
}
});
let mut levels: Vec<BitVec<u8>> = Vec::new();
if alphabet.len() == 1 {
let mut local_bitvec = BitVec::new();
input.iter().for_each(|_x| {
local_bitvec.push(false);
});
levels.push(local_bitvec);
} else {
WaveletTreeCompact::create_bitvec(0, &mut levels, &input[..], &alphabet[..]);
}
let mut bit_vec: BitVec<u8> = BitVec::new();
for l in levels {
let filler: BitVec<u8> = match l.len() {
0 => BitVec::new(),
x => BitVec::new_fill(false, sequence_len - (x as u64)),
};
bit_vec = bit_vec.bit_concat(l).to_bit_vec();
bit_vec = bit_vec.bit_concat(filler).to_bit_vec();
}
WaveletTreeCompact {
alphabet: alphabet.to_owned(),
bit_vec: RankSelect::new(bit_vec, crate::SUPERBLOCK_SIZE),
sequence_len,
}
}
fn create_bitvec(level: usize, levels: &mut Vec<BitVec<u8>>, sequence: &[T], alphabet: &[T]) {
if alphabet.len() >= 2 {
let (left_alphabet, right_alphabet) = WaveletTreeCompact::split_alphabet(alphabet);
let mut l_seq = Vec::new();
let mut r_seq = Vec::new();
let mut local_bitvec = BitVec::new();
sequence.iter().for_each(|x| {
if left_alphabet.contains(x) {
l_seq.push(*x);
local_bitvec.push(false);
} else {
r_seq.push(*x);
local_bitvec.push(true);
}
});
if !levels.len() > level {
levels.push(BitVec::new());
}
levels[level] = levels[level].bit_concat(local_bitvec).to_bit_vec();
WaveletTreeCompact::create_bitvec(level + 1, levels, &l_seq[..], left_alphabet);
WaveletTreeCompact::create_bitvec(level + 1, levels, &r_seq[..], right_alphabet);
}
}
fn rank_0(&self, l: u64, r: u64) -> Option<u64> {
let offset = if self.bit_vec.bits()[l] {
Some(0)
} else {
Some(1)
};
Some(self.bit_vec.rank_0(r)? - self.bit_vec.rank_0(l)? + offset?)
}
fn rank_1(&self, l: u64, r: u64) -> Option<u64> {
let offset = if self.bit_vec.bits()[l] {
Some(1)
} else {
Some(0)
};
Some(self.bit_vec.rank_1(r)? - self.bit_vec.rank_1(l)? + offset?)
}
fn select_0(&self, l: u64, r: u64, n: u64) -> Option<u64> {
let offset = if self.bit_vec.bits()[l] { 0 } else { 1 };
let rank_lm1 = self.bit_vec.rank_0(l).unwrap() - offset;
let index = self.bit_vec.select_0(n + rank_lm1);
match index {
None => None,
Some(i) => {
if i <= r && i >= l {
Some(i - l)
} else {
None
}
}
}
}
fn select_1(&self, l: u64, r: u64, n: u64) -> Option<u64> {
let offset = if self.bit_vec.bits()[l] { 1 } else { 0 };
let rank_lm1 = self.bit_vec.rank_1(l).unwrap() - offset;
let index = self.bit_vec.select_1(n + rank_lm1);
match index {
None => None,
Some(i) => {
if i <= r && i >= l {
Some(i - l)
} else {
None
}
}
}
}
fn split_alphabet<'a>(alphabet: &'a [T]) -> (&[T], &[T]) {
if alphabet.is_empty() {
panic!("can not split empty alphabet")
} else if alphabet.len() == 1 {
(alphabet, &[])
} else {
alphabet.split_at(2usize.pow(((alphabet.len()) as f64).log2().ceil() as u32 - 1))
}
}
fn access_helper(&self, position: u64, alphabet: &[T], l: u64, r: u64) -> Option<T> {
if alphabet.len() <= 1 {
return Some(alphabet[0]);
}
if l == r {
return None;
}
assert!(l <= r);
let (left_alphabet, right_alphabet) = WaveletTreeCompact::split_alphabet(alphabet);
let pos_rank = self.rank_0(l, r)?;
if self.bit_vec.get(l + position) {
let child_l = self.sequence_len + pos_rank + l;
let child_r = self.sequence_len + r;
let position = self.rank_1(l, l + position)?;
self.access_helper(position - 1, right_alphabet, child_l, child_r)
} else {
let child_l = self.sequence_len + l;
let child_r = child_l + pos_rank - 1;
let position = self.rank_0(l, l + position)?;
self.access_helper(position - 1, left_alphabet, child_l, child_r)
}
}
fn rank_helper(
&self,
alphabet: &[T],
object: T,
index: u64,
left: u64,
right: u64,
) -> Option<u64> {
let (left_alphabet, right_alphabet) = WaveletTreeCompact::split_alphabet(alphabet);
if left_alphabet.contains(&object) {
if left_alphabet.len() == 1 {
self.rank_0(left, left + index)
} else {
if let Some(w) = self.rank_0(left, right) {
let left_child_left = left + self.sequence_len;
let left_child_right = left_child_left + w - 1;
let pos_in_child = self.rank_0(left, left + index);
match pos_in_child {
None => None,
Some(0) => Some(0),
Some(i) => self.rank_helper(
left_alphabet,
object,
i - 1,
left_child_left,
left_child_right,
),
}
} else {
panic!("rank: no child in tree");
}
}
} else if right_alphabet.contains(&object) {
if right_alphabet.len() == 1 {
self.rank_1(left, left + index)
} else {
if let Some(w) = self.rank_0(left, right) {
let left_child_left = left + self.sequence_len;
let left_child_right = left_child_left + w - 1;
let right_child_left = left_child_right + 1;
let right_child_right = right + self.sequence_len;
let pos_in_child = self.rank_1(left, left + index);
match pos_in_child {
None => None,
Some(0) => Some(0),
Some(i) => self.rank_helper(
right_alphabet,
object,
i - 1,
right_child_left,
right_child_right,
),
}
} else {
panic!("rank: no child in tree");
}
}
} else {
None
}
}
fn select_helper(
&self,
alphabet: &[T],
object: T,
index: u64,
left: u64,
right: u64,
) -> Option<u64> {
let (left_alphabet, right_alphabet) = WaveletTreeCompact::split_alphabet(alphabet);
if left_alphabet.contains(&object) {
if left_alphabet.len() == 1 {
self.select_0(left, right, index)
} else {
if let Some(w) = self.rank_0(left, right) {
let left_child_left = left + self.sequence_len;
let left_child_right = left_child_left + w - 1;
match self.select_helper(
left_alphabet,
object,
index,
left_child_left,
left_child_right,
) {
Some(x) => self.select_0(left, right, x + 1),
None => None,
}
} else {
panic!("rank: no child in tree");
}
}
} else if right_alphabet.contains(&object) {
if right_alphabet.len() == 1 {
self.select_1(left, right, index)
} else {
if let Some(w) = self.rank_0(left, right) {
let left_child_left = left + self.sequence_len;
let left_child_right = left_child_left + w - 1;
let right_child_left = left_child_right + 1;
let right_child_right = right + self.sequence_len;
match self.select_helper(
right_alphabet,
object,
index,
right_child_left,
right_child_right,
) {
Some(x) => self.select_1(left, right, x + 1),
None => None,
}
} else {
panic!("rank: no child in tree");
}
}
} else {
None
}
}
}
impl<T: PartialEq + Copy> WaveletTree<T> for WaveletTreeCompact<T> {
fn access(&self, position: u64) -> Option<T> {
if position >= self.sequence_len {
None
} else {
self.access_helper(position, &self.alphabet[..], 0, self.sequence_len - 1)
}
}
fn rank(&self, object: T, n: u64) -> Option<u64> {
if n >= self.sequence_len {
return None;
}
self.rank_helper(&self.alphabet[..], object, n, 0, self.sequence_len - 1)
}
fn select(&self, object: T, n: u64) -> Option<u64> {
if self.sequence_len == 0 {
return None;
}
self.select_helper(&self.alphabet[..], object, n, 0, self.sequence_len - 1)
}
}
impl<T: PartialEq + Copy + Debug> Debug for WaveletTreeCompact<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"WaveletTreeCompact {{ alphabet:{:?} bv:{:?}}}",
self.alphabet,
self.bit_vec.bits()
)
}
}
impl<T: PartialEq + Copy> PartialEq for WaveletTreeCompact<T> {
fn eq(&self, other: &Self) -> bool {
self.alphabet == other.alphabet && self.bit_vec.bits() == other.bit_vec.bits()
}
}
impl PartialEq<&str> for WaveletTreeCompact<char> {
fn eq(&self, other: &&str) -> bool {
if self.sequence_len as usize == other.chars().count() {
for (i, c) in other.chars().enumerate() {
match self.access(i as u64) {
None => return false,
Some(c2) => {
if c2 != c {
return false;
}
}
}
}
true
} else {
false
}
}
}
impl<T: PartialEq + Copy> PartialEq<Vec<T>> for WaveletTreeCompact<T> {
fn eq(&self, other: &Vec<T>) -> bool {
if self.sequence_len as usize == other.len() {
for (i, c) in other.iter().enumerate() {
match self.access(i as u64) {
None => return false,
Some(c2) => {
if *c != c2 {
return false;
}
}
}
}
true
} else {
false
}
}
}
pub struct TreeIteratorCompact<T: PartialEq + Copy> {
index: usize,
tree: WaveletTreeCompact<T>,
}
impl<T: PartialEq + Copy> Iterator for TreeIteratorCompact<T> {
type Item = T;
fn next(&mut self) -> Option<T> {
let result = self.tree.access(self.index as u64);
self.index += 1;
result
}
}
impl<T: PartialEq + Copy> IntoIterator for WaveletTreeCompact<T> {
type Item = T;
type IntoIter = TreeIteratorCompact<T>;
fn into_iter(self) -> Self::IntoIter {
TreeIteratorCompact {
index: 0,
tree: self,
}
}
}
impl From<String> for WaveletTreeCompact<char> {
fn from(input: String) -> Self {
WaveletTreeCompact::new(input.chars().collect())
}
}
impl From<&str> for WaveletTreeCompact<char> {
fn from(input: &str) -> Self {
WaveletTreeCompact::new(input.chars().collect())
}
}
impl<T: PartialEq + Copy> From<Vec<T>> for WaveletTreeCompact<T> {
fn from(input: Vec<T>) -> Self {
WaveletTreeCompact::new(input)
}
}
impl<T: PartialEq + Copy> FromIterator<T> for WaveletTreeCompact<T> {
fn from_iter<I: IntoIterator<Item = T>>(input: I) -> Self {
WaveletTreeCompact::new(input.into_iter().collect())
}
}
#[cfg(test)]
mod tests {
use super::*;
use unicode_segmentation::UnicodeSegmentation;
#[test]
fn test_new() {
let w_tree = WaveletTreeCompact::new("alabar_a_la_alabarda".chars().collect());
println!("{:?}", w_tree);
}
#[test]
fn test_new_split_alphabet() {
let alphabet: Vec<char> = "abcde".chars().collect();
let (l, r) = WaveletTreeCompact::split_alphabet(&alphabet[..]);
assert!(alphabet.len() == 5);
assert_eq!(l.len(), 4);
assert_eq!(r.len(), 1);
let test_string = "abcde".chars().collect();
let w_tree = WaveletTreeCompact::new(test_string);
assert_eq!(w_tree.bit_vec.bits()[0], false);
assert_eq!(w_tree.bit_vec.bits()[1], false);
assert_eq!(w_tree.bit_vec.bits()[2], false);
assert_eq!(w_tree.bit_vec.bits()[3], false);
assert_eq!(w_tree.bit_vec.bits()[4], true);
assert_eq!(w_tree.bit_vec.bits()[5], false);
assert_eq!(w_tree.bit_vec.bits()[6], false);
assert_eq!(w_tree.bit_vec.bits()[7], true);
assert_eq!(w_tree.bit_vec.bits()[8], true);
assert_eq!(w_tree.bit_vec.bits()[9], false);
assert_eq!(w_tree.bit_vec.bits()[10], false);
assert_eq!(w_tree.bit_vec.bits()[11], true);
assert_eq!(w_tree.bit_vec.bits()[12], false);
assert_eq!(w_tree.bit_vec.bits()[13], true);
assert_eq!(w_tree.bit_vec.bits()[14], false);
}
#[test]
fn test_rank_edge_cases() {
let one_element = WaveletTreeCompact::new("a".chars().collect());
assert_eq!(one_element.rank('a', 0), Some(1));
assert_eq!(one_element.rank('a', 1), None);
assert_eq!(one_element.rank('b', 0), None);
}
#[test]
fn test_rank_select_0_1() {
let test_string = "abcde".chars().collect();
let w_tree = WaveletTreeCompact::new(test_string);
assert_eq!(w_tree.rank_0(0, 3), Some(4));
assert_eq!(w_tree.rank_0(0, 4), Some(4));
assert_eq!(w_tree.rank_0(4, 5), Some(1));
assert_eq!(w_tree.rank_0(4, 4), Some(0));
assert_eq!(w_tree.rank_0(3, 3), Some(1));
assert_eq!(w_tree.rank_1(0, 3), Some(0));
assert_eq!(w_tree.rank_1(4, 5), Some(1));
assert_eq!(w_tree.select_0(0, 3, 2), Some(1));
assert_eq!(w_tree.select_0(3, 5, 2), Some(2));
assert_eq!(w_tree.select_0(7, 8, 1), None);
assert_eq!(w_tree.select_1(7, 8, 1), Some(0));
assert_eq!(w_tree.select_1(2, 5, 3), None);
}
#[test]
fn test_rank_5_letter() {
let test_string = "abcde".chars().collect();
let w_tree = WaveletTreeCompact::new(test_string);
assert_eq!(w_tree.bit_vec.bits().len(), 15);
assert_eq!(w_tree.rank('a', 0), Some(1));
assert_eq!(w_tree.rank('b', 1), Some(1));
assert_eq!(w_tree.rank('c', 2), Some(1));
assert_eq!(w_tree.rank('d', 3), Some(1));
assert_eq!(w_tree.rank('e', 4), Some(1));
}
#[test]
fn test_rank_long_sequence_letters() {
let test_string = "aaaaabbbbbcde".chars().collect();
let w_tree = WaveletTreeCompact::new(test_string);
assert_eq!(w_tree.rank('a', 0), Some(1));
assert_eq!(w_tree.rank('a', 1), Some(2));
assert_eq!(w_tree.rank('a', 2), Some(3));
assert_eq!(w_tree.rank('a', 3), Some(4));
assert_eq!(w_tree.rank('a', 4), Some(5));
assert_eq!(w_tree.rank('a', 5), Some(5));
assert_eq!(w_tree.rank('a', 6), Some(5));
assert_eq!(w_tree.rank('a', 7), Some(5));
}
#[test]
fn test_rank_unicode() {
let test_string = "Hello world, こんにちは世界, Привет, мир";
let test_string = UnicodeSegmentation::graphemes(test_string, true).collect::<Vec<&str>>();
let w_tree: WaveletTreeCompact<&str> = WaveletTreeCompact::new(test_string);
assert_eq!(w_tree.rank("o", 4), Some(1));
assert_eq!(w_tree.rank("世", 32), Some(1));
assert_eq!(w_tree.rank("и", 32), Some(2));
assert_eq!(w_tree.rank("o", 16), Some(2));
assert_eq!(w_tree.rank("世", 16), Some(0));
assert_eq!(w_tree.rank("и", 16), Some(0));
assert_eq!(w_tree.rank("o", 0), Some(0));
assert_eq!(w_tree.rank("世", 0), Some(0));
assert_eq!(w_tree.rank("и", 0), Some(0));
assert_eq!(w_tree.rank("木", 32), None);
}
#[test]
fn test_new_pointer_free() {
use bv::*;
let w_tree = WaveletTreeCompact::from("ababababcd");
let alphabet = vec!['a', 'b', 'c', 'd'];
let bit_vec = bit_vec![
false, false, false, false, false, false, false, false, true, true,
false, true, false, true, false, true, false, true, false, true
];
let bit_vec = RankSelect::new(bit_vec, crate::SUPERBLOCK_SIZE);
let w_tree_expect = WaveletTreeCompact {
alphabet,
bit_vec,
sequence_len: 9,
};
assert_eq!(w_tree, w_tree_expect);
}
#[test]
fn test_access_unicode_string() {
let test_string = "Hello world, こんにちは世界, Привет, мир";
let w_tree = WaveletTreeCompact::from(test_string);
for (i, c) in test_string.chars().enumerate() {
assert_eq!(w_tree.access(i as u64), Some(c));
}
}
#[test]
fn test_access_invalid_pos() {
let test_string = "Hello world";
let w_tree = WaveletTreeCompact::from(test_string);
assert_eq!(w_tree.access(11), None);
assert_eq!(w_tree.access(100), None);
}
#[test]
fn test_access_num_vec() {
let test_vec: Vec<i64> = vec![
1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 10, 5, 3, 6, 3, 6, 2, 7, 4, 8, -3, -6, -3, -10, -6, 2, 8,
3, 7, 42, 1024, 2048, 1024, 3, 6, 8, 3,
];
let w_tree = WaveletTreeCompact::new(test_vec.clone());
for (i, c) in test_vec.iter().enumerate() {
assert_eq!(w_tree.access(i as u64), Some(*c));
}
}
#[test]
fn test_from_string() {
let test_string: String = "Test".to_string();
let w_tree = WaveletTreeCompact::from(test_string.clone());
for (i, c) in test_string.chars().enumerate() {
assert_eq!(w_tree.access(i as u64), Some(c));
}
assert_eq!(w_tree.access(test_string.len() as u64), None);
}
#[test]
fn test_select() {
let test_string = "abcdeb";
let w_tree = WaveletTreeCompact::from(test_string);
assert_eq!(w_tree.select('b', 1), Some(1));
assert_eq!(w_tree.select('b', 2), Some(5));
assert_eq!(w_tree.select('b', 0), None);
}
#[test]
fn test_serialize_deserialize() {
let test_string = "cbacbcbcbbcabcabcabcabbca";
let w_tree = WaveletTreeCompact::from(test_string);
let serialized = serde_json::to_string(&w_tree).unwrap();
let w_tree2: WaveletTreeCompact<char> = serde_json::from_str(&serialized).unwrap();
assert_eq!(w_tree, w_tree2)
}
#[test]
fn test_fail_management() {
let a = WaveletTreeCompact::from("");
assert_eq!(a.access(0), None);
assert_eq!(a.rank('a', 0), None);
assert_eq!(a.select('a', 0), None);
let b = WaveletTreeCompact::from("abc");
assert_eq!(b.access(4), None);
assert_eq!(b.rank('b', 4), None);
assert_eq!(b.select('b', 2), None);
assert_eq!(b.rank('d', 1), None);
assert_eq!(b.select('d', 1), None);
assert_eq!(b.select('a', 0), None);
assert_eq!(b.rank('c', 1), Some(0));
}
#[test]
fn test_partialeq_str() {
let test_string = "Hello world, こんにちは世界, Привет, мир";
let w_tree: WaveletTreeCompact<char> = test_string.into();
assert!(w_tree == test_string);
let test_string_wrong = "Hello worlt, こんにちは世界, Привет, мир";
assert!(w_tree != test_string_wrong);
let test_string_shorter = "Hello world";
assert!(w_tree != test_string_shorter);
}
#[test]
fn test_partialeq_vec() {
let vec = vec![1, 2, 3, 4, 5, 1, 2, 4, 1, 3, 5, 2, 4];
let w_tree: WaveletTreeCompact<i32> = vec.clone().into();
assert!(w_tree == vec);
let vec_wrong = vec![1, 2, 3, 4, 5, 1, 2, 4, 1, 3, 7, 2, 4];
assert!(w_tree != vec_wrong);
let vec_short = vec![1, 2, 3, 4, 5];
assert!(w_tree != vec_short);
}
#[test]
fn test_into_iter() {
let test_str: String = String::from("Hello world");
let w_tree: WaveletTreeCompact<char> = test_str.clone().into();
test_str.chars().eq(w_tree.into_iter());
}
}